The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netlink/netlink_io.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2021 Ng Peng Nam Sean
    5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 #include <sys/param.h>
   32 #include <sys/ck.h>
   33 #include <sys/lock.h>
   34 #include <sys/malloc.h>
   35 #include <sys/mbuf.h>
   36 #include <sys/mutex.h>
   37 #include <sys/socket.h>
   38 #include <sys/socketvar.h>
   39 #include <sys/syslog.h>
   40 
   41 #include <netlink/netlink.h>
   42 #include <netlink/netlink_ctl.h>
   43 #include <netlink/netlink_linux.h>
   44 #include <netlink/netlink_var.h>
   45 
   46 #define DEBUG_MOD_NAME  nl_io
   47 #define DEBUG_MAX_LEVEL LOG_DEBUG3
   48 #include <netlink/netlink_debug.h>
   49 _DECLARE_DEBUG(LOG_DEBUG);
   50 
   51 /*
   52  * The logic below provides a p2p interface for receiving and
   53  * sending netlink data between the kernel and userland.
   54  */
   55 
   56 static const struct sockaddr_nl _nl_empty_src = {
   57         .nl_len = sizeof(struct sockaddr_nl),
   58         .nl_family = PF_NETLINK,
   59         .nl_pid = 0 /* comes from the kernel */
   60 };
   61 static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
   62 
   63 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
   64 
   65 
   66 static void
   67 queue_push(struct nl_io_queue *q, struct mbuf *mq)
   68 {
   69         while (mq != NULL) {
   70                 struct mbuf *m = mq;
   71                 mq = mq->m_nextpkt;
   72                 m->m_nextpkt = NULL;
   73 
   74                 q->length += m_length(m, NULL);
   75                 STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
   76         }
   77 }
   78 
   79 static void
   80 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
   81 {
   82         MPASS(m->m_nextpkt == NULL);
   83 
   84         q->length += m_length(m, NULL);
   85         STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
   86 }
   87 
   88 static struct mbuf *
   89 queue_pop(struct nl_io_queue *q)
   90 {
   91         if (!STAILQ_EMPTY(&q->head)) {
   92                 struct mbuf *m = STAILQ_FIRST(&q->head);
   93                 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
   94                 m->m_nextpkt = NULL;
   95                 q->length -= m_length(m, NULL);
   96 
   97                 return (m);
   98         }
   99         return (NULL);
  100 }
  101 
  102 static struct mbuf *
  103 queue_head(const struct nl_io_queue *q)
  104 {
  105         return (STAILQ_FIRST(&q->head));
  106 }
  107 
  108 static inline bool
  109 queue_empty(const struct nl_io_queue *q)
  110 {
  111         return (q->length == 0);
  112 }
  113 
  114 static void
  115 queue_free(struct nl_io_queue *q)
  116 {
  117         while (!STAILQ_EMPTY(&q->head)) {
  118                 struct mbuf *m = STAILQ_FIRST(&q->head);
  119                 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
  120                 m->m_nextpkt = NULL;
  121                 m_freem(m);
  122         }
  123         q->length = 0;
  124 }
  125 
  126 
  127 static void
  128 nl_schedule_taskqueue(struct nlpcb *nlp)
  129 {
  130         if (!nlp->nl_task_pending) {
  131                 nlp->nl_task_pending = true;
  132                 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
  133                 NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
  134         } else {
  135                 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
  136         }
  137 }
  138 
  139 int
  140 nl_receive_async(struct mbuf *m, struct socket *so)
  141 {
  142         struct nlpcb *nlp = sotonlpcb(so);
  143         int error = 0;
  144 
  145         m->m_nextpkt = NULL;
  146 
  147         NLP_LOCK(nlp);
  148 
  149         if ((__predict_true(nlp->nl_active))) {
  150                 sbappend(&so->so_snd, m, 0);
  151                 NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
  152                 nl_schedule_taskqueue(nlp);
  153         } else {
  154                 NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
  155                     m_length(m, NULL));
  156                 m_free(m);
  157                 error = EINVAL;
  158         }
  159 
  160         NLP_UNLOCK(nlp);
  161 
  162         return (error);
  163 }
  164 
  165 static bool
  166 tx_check_locked(struct nlpcb *nlp)
  167 {
  168         if (queue_empty(&nlp->tx_queue))
  169                 return (true);
  170 
  171         /*
  172          * Check if something can be moved from the internal TX queue
  173          * to the socket queue.
  174          */
  175 
  176         bool appended = false;
  177         struct sockbuf *sb = &nlp->nl_socket->so_rcv;
  178         SOCKBUF_LOCK(sb);
  179 
  180         while (true) {
  181                 struct mbuf *m = queue_head(&nlp->tx_queue);
  182                 if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
  183                         /* appended successfully */
  184                         queue_pop(&nlp->tx_queue);
  185                         appended = true;
  186                 } else
  187                         break;
  188         }
  189 
  190         SOCKBUF_UNLOCK(sb);
  191 
  192         if (appended)
  193                 sorwakeup(nlp->nl_socket);
  194 
  195         return (queue_empty(&nlp->tx_queue));
  196 }
  197 
  198 static bool
  199 nl_process_received_one(struct nlpcb *nlp)
  200 {
  201         bool reschedule = false;
  202 
  203         NLP_LOCK(nlp);
  204         nlp->nl_task_pending = false;
  205 
  206         if (!tx_check_locked(nlp)) {
  207                 /* TX overflow queue still not empty, ignore RX */
  208                 NLP_UNLOCK(nlp);
  209                 return (false);
  210         }
  211 
  212         if (queue_empty(&nlp->rx_queue)) {
  213                 /*
  214                  * Grab all data we have from the socket TX queue
  215                  * and store it the internal queue, so it can be worked on
  216                  * w/o holding socket lock.
  217                  */
  218                 struct sockbuf *sb = &nlp->nl_socket->so_snd;
  219 
  220                 SOCKBUF_LOCK(sb);
  221                 unsigned int avail = sbavail(sb);
  222                 if (avail > 0) {
  223                         NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
  224                         queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
  225                 }
  226                 SOCKBUF_UNLOCK(sb);
  227         } else {
  228                 /* Schedule another pass to read from the socket queue */
  229                 reschedule = true;
  230         }
  231 
  232         int prev_hiwat = nlp->tx_queue.hiwat;
  233         NLP_UNLOCK(nlp);
  234 
  235         while (!queue_empty(&nlp->rx_queue)) {
  236                 struct mbuf *m = queue_pop(&nlp->rx_queue);
  237 
  238                 m = nl_process_mbuf(m, nlp);
  239                 if (m != NULL) {
  240                         queue_push_head(&nlp->rx_queue, m);
  241                         reschedule = false;
  242                         break;
  243                 }
  244         }
  245         if (nlp->tx_queue.hiwat > prev_hiwat) {
  246                 NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
  247 
  248         }
  249 
  250         return (reschedule);
  251 }
  252 
  253 static void
  254 nl_process_received(struct nlpcb *nlp)
  255 {
  256         NL_LOG(LOG_DEBUG3, "taskqueue called");
  257 
  258         while (nl_process_received_one(nlp))
  259                 ;
  260 }
  261 
  262 void
  263 nl_init_io(struct nlpcb *nlp)
  264 {
  265         STAILQ_INIT(&nlp->rx_queue.head);
  266         STAILQ_INIT(&nlp->tx_queue.head);
  267 }
  268 
  269 void
  270 nl_free_io(struct nlpcb *nlp)
  271 {
  272         queue_free(&nlp->rx_queue);
  273         queue_free(&nlp->tx_queue);
  274 }
  275 
  276 /*
  277  * Called after some data have been read from the socket.
  278  */
  279 void
  280 nl_on_transmit(struct nlpcb *nlp)
  281 {
  282         NLP_LOCK(nlp);
  283 
  284         struct socket *so = nlp->nl_socket;
  285         if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
  286                 unsigned long dropped_bytes = nlp->nl_dropped_bytes;
  287                 unsigned long dropped_messages = nlp->nl_dropped_messages;
  288                 nlp->nl_dropped_bytes = 0;
  289                 nlp->nl_dropped_messages = 0;
  290 
  291                 struct sockbuf *sb = &so->so_rcv;
  292                 NLP_LOG(LOG_DEBUG, nlp,
  293                     "socket RX overflowed, %lu messages (%lu bytes) dropped. "
  294                     "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
  295                     sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
  296                 /* TODO: send netlink message */
  297         }
  298 
  299         nl_schedule_taskqueue(nlp);
  300         NLP_UNLOCK(nlp);
  301 }
  302 
  303 void
  304 nl_taskqueue_handler(void *_arg, int pending)
  305 {
  306         struct nlpcb *nlp = (struct nlpcb *)_arg;
  307 
  308         CURVNET_SET(nlp->nl_socket->so_vnet);
  309         nl_process_received(nlp);
  310         CURVNET_RESTORE();
  311 }
  312 
  313 static __noinline void
  314 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
  315 {
  316         queue_push(&nlp->tx_queue, m);
  317         nlp->nl_tx_blocked = true;
  318 
  319         if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
  320                 nlp->tx_queue.hiwat = nlp->tx_queue.length;
  321 }
  322 
  323 /*
  324  * Tries to send @m to the socket @nlp.
  325  *
  326  * @m: mbuf(s) to send to. Consumed in any case.
  327  * @nlp: socket to send to
  328  * @cnt: number of messages in @m
  329  * @io_flags: combination of NL_IOF_* flags
  330  *
  331  * Returns true on success.
  332  * If no queue overrunes happened, wakes up socket owner.
  333  */
  334 bool
  335 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
  336 {
  337         bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
  338         bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
  339         bool result = true;
  340 
  341         IF_DEBUG_LEVEL(LOG_DEBUG2) {
  342                 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
  343                 NLP_LOG(LOG_DEBUG2, nlp,
  344                     "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
  345                     m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
  346                     io_flags);
  347         }
  348 
  349         if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
  350                 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
  351                 if (m == NULL)
  352                         return (false);
  353         }
  354 
  355         NLP_LOCK(nlp);
  356 
  357         if (__predict_false(nlp->nl_socket == NULL)) {
  358                 NLP_UNLOCK(nlp);
  359                 m_freem(m);
  360                 return (false);
  361         }
  362 
  363         if (!queue_empty(&nlp->tx_queue)) {
  364                 if (ignore_limits) {
  365                         queue_push_tx(nlp, m);
  366                 } else {
  367                         m_free(m);
  368                         result = false;
  369                 }
  370                 NLP_UNLOCK(nlp);
  371                 return (result);
  372         }
  373 
  374         struct socket *so = nlp->nl_socket;
  375         if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
  376                 sorwakeup(so);
  377                 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
  378         } else {
  379                 if (ignore_limits) {
  380                         queue_push_tx(nlp, m);
  381                 } else {
  382                         /*
  383                          * Store dropped data so it can be reported
  384                          * on the next read
  385                          */
  386                         nlp->nl_dropped_bytes += m_length(m, NULL);
  387                         nlp->nl_dropped_messages += num_messages;
  388                         NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
  389                             (unsigned long)nlp->nl_dropped_messages, num_messages,
  390                             (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
  391                         soroverflow(so);
  392                         m_freem(m);
  393                         result = false;
  394                 }
  395         }
  396         NLP_UNLOCK(nlp);
  397 
  398         return (result);
  399 }
  400 
  401 static int
  402 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
  403     struct nlpcb *nlp, struct nl_pstate *npt)
  404 {
  405         nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
  406         int error = 0;
  407 
  408         NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
  409             hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
  410             hdr->nlmsg_pid);
  411 
  412         if (__predict_false(hdr->nlmsg_len > remaining_length)) {
  413                 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
  414                     hdr->nlmsg_len, remaining_length);
  415                 return (EINVAL);
  416         } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
  417                 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
  418                 return (EINVAL);
  419         }
  420         /* Stamp each message with sender pid */
  421         hdr->nlmsg_pid = nlp->nl_port;
  422 
  423         npt->hdr = hdr;
  424 
  425         if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
  426                 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
  427                    hdr->nlmsg_type);
  428 
  429                 if (nlp->nl_linux && linux_netlink_p != NULL) {
  430                         struct nlmsghdr *hdr_orig = hdr;
  431                         hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
  432                         if (hdr == NULL) {
  433                                 npt->hdr = hdr_orig;
  434                                 if (hdr->nlmsg_flags & NLM_F_ACK)
  435                                         nlmsg_ack(nlp, EAGAIN, hdr, npt);
  436                                 return (0);
  437                         }
  438                 }
  439                 error = handler(hdr, npt);
  440                 NL_LOG(LOG_DEBUG2, "retcode: %d", error);
  441         }
  442         if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
  443                 if (!npt->nw->suppress_ack) {
  444                         NL_LOG(LOG_DEBUG3, "ack");
  445                         nlmsg_ack(nlp, error, hdr, npt);
  446                 }
  447         }
  448 
  449         return (0);
  450 }
  451 
  452 static void
  453 npt_clear(struct nl_pstate *npt)
  454 {
  455         lb_clear(&npt->lb);
  456         npt->error = 0;
  457         npt->err_msg = NULL;
  458         npt->err_off = 0;
  459         npt->hdr = NULL;
  460         npt->nw->suppress_ack = false;
  461 }
  462 
  463 /*
  464  * Processes an incoming packet, which can contain multiple netlink messages
  465  */
  466 static struct mbuf *
  467 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
  468 {
  469         int offset, buffer_length;
  470         struct nlmsghdr *hdr;
  471         char *buffer;
  472         int error;
  473 
  474         NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
  475 
  476         struct nl_writer nw = {};
  477         if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
  478                 m_freem(m);
  479                 NL_LOG(LOG_DEBUG, "error allocating socket writer");
  480                 return (NULL);
  481         }
  482 
  483         nlmsg_ignore_limit(&nw);
  484         /* TODO: alloc this buf once for nlp */
  485         int data_length = m_length(m, NULL);
  486         buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
  487         if (nlp->nl_linux)
  488                 buffer_length += roundup2(data_length, 8);
  489         buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
  490         if (buffer == NULL) {
  491                 m_freem(m);
  492                 nlmsg_flush(&nw);
  493                 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
  494                     buffer_length);
  495                 return (NULL);
  496         }
  497         m_copydata(m, 0, data_length, buffer);
  498 
  499         struct nl_pstate npt = {
  500                 .nlp = nlp,
  501                 .lb.base = &buffer[roundup2(data_length, 8)],
  502                 .lb.size = buffer_length - roundup2(data_length, 8),
  503                 .nw = &nw,
  504                 .strict = nlp->nl_flags & NLF_STRICT,
  505         };
  506 
  507         for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
  508                 hdr = (struct nlmsghdr *)&buffer[offset];
  509                 /* Save length prior to calling handler */
  510                 int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
  511                 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
  512                 npt_clear(&npt);
  513                 error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
  514                 offset += msglen;
  515                 if (__predict_false(error != 0 || nlp->nl_tx_blocked))
  516                         break;
  517         }
  518         NL_LOG(LOG_DEBUG3, "packet parsing done");
  519         free(buffer, M_NETLINK);
  520         nlmsg_flush(&nw);
  521 
  522         if (nlp->nl_tx_blocked) {
  523                 NLP_LOCK(nlp);
  524                 nlp->nl_tx_blocked = false;
  525                 NLP_UNLOCK(nlp);
  526                 m_adj(m, offset);
  527                 return (m);
  528         } else {
  529                 m_freem(m);
  530                 return (NULL);
  531         }
  532 }

Cache object: f43cc13ca9fc05e97929d4076fa5457a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.