The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_usrreq.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: uipc_usrreq.c,v 1.119.4.3 2009/11/08 21:47:45 snj Exp $        */
    2 
    3 /*-
    4  * Copyright (c) 1998, 2000, 2004, 2008, 2009 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
    9  * NASA Ames Research Center, and by Andrew Doran.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   30  * POSSIBILITY OF SUCH DAMAGE.
   31  */
   32 
   33 /*
   34  * Copyright (c) 1982, 1986, 1989, 1991, 1993
   35  *      The Regents of the University of California.  All rights reserved.
   36  *
   37  * Redistribution and use in source and binary forms, with or without
   38  * modification, are permitted provided that the following conditions
   39  * are met:
   40  * 1. Redistributions of source code must retain the above copyright
   41  *    notice, this list of conditions and the following disclaimer.
   42  * 2. Redistributions in binary form must reproduce the above copyright
   43  *    notice, this list of conditions and the following disclaimer in the
   44  *    documentation and/or other materials provided with the distribution.
   45  * 3. Neither the name of the University nor the names of its contributors
   46  *    may be used to endorse or promote products derived from this software
   47  *    without specific prior written permission.
   48  *
   49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   59  * SUCH DAMAGE.
   60  *
   61  *      @(#)uipc_usrreq.c       8.9 (Berkeley) 5/14/95
   62  */
   63 
   64 /*
   65  * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
   66  *
   67  * Redistribution and use in source and binary forms, with or without
   68  * modification, are permitted provided that the following conditions
   69  * are met:
   70  * 1. Redistributions of source code must retain the above copyright
   71  *    notice, this list of conditions and the following disclaimer.
   72  * 2. Redistributions in binary form must reproduce the above copyright
   73  *    notice, this list of conditions and the following disclaimer in the
   74  *    documentation and/or other materials provided with the distribution.
   75  * 3. All advertising materials mentioning features or use of this software
   76  *    must display the following acknowledgement:
   77  *      This product includes software developed by the University of
   78  *      California, Berkeley and its contributors.
   79  * 4. Neither the name of the University nor the names of its contributors
   80  *    may be used to endorse or promote products derived from this software
   81  *    without specific prior written permission.
   82  *
   83  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   93  * SUCH DAMAGE.
   94  *
   95  *      @(#)uipc_usrreq.c       8.9 (Berkeley) 5/14/95
   96  */
   97 
   98 #include <sys/cdefs.h>
   99 __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.119.4.3 2009/11/08 21:47:45 snj Exp $");
  100 
  101 #include <sys/param.h>
  102 #include <sys/systm.h>
  103 #include <sys/proc.h>
  104 #include <sys/filedesc.h>
  105 #include <sys/domain.h>
  106 #include <sys/protosw.h>
  107 #include <sys/socket.h>
  108 #include <sys/socketvar.h>
  109 #include <sys/unpcb.h>
  110 #include <sys/un.h>
  111 #include <sys/namei.h>
  112 #include <sys/vnode.h>
  113 #include <sys/file.h>
  114 #include <sys/stat.h>
  115 #include <sys/mbuf.h>
  116 #include <sys/kauth.h>
  117 #include <sys/kmem.h>
  118 #include <sys/atomic.h>
  119 #include <sys/uidinfo.h>
  120 #include <sys/kernel.h>
  121 #include <sys/kthread.h>
  122 
  123 /*
  124  * Unix communications domain.
  125  *
  126  * TODO:
  127  *      SEQPACKET, RDM
  128  *      rethink name space problems
  129  *      need a proper out-of-band
  130  *
  131  * Notes on locking:
  132  *
  133  * The generic rules noted in uipc_socket2.c apply.  In addition:
  134  *
  135  * o We have a global lock, uipc_lock.
  136  *
  137  * o All datagram sockets are locked by uipc_lock.
  138  *
  139  * o For stream socketpairs, the two endpoints are created sharing the same
  140  *   independent lock.  Sockets presented to PRU_CONNECT2 must already have
  141  *   matching locks.
  142  *
  143  * o Stream sockets created via socket() start life with their own
  144  *   independent lock.
  145  * 
  146  * o Stream connections to a named endpoint are slightly more complicated.
  147  *   Sockets that have called listen() have their lock pointer mutated to
  148  *   the global uipc_lock.  When establishing a connection, the connecting
  149  *   socket also has its lock mutated to uipc_lock, which matches the head
  150  *   (listening socket).  We create a new socket for accept() to return, and
  151  *   that also shares the head's lock.  Until the connection is completely
  152  *   done on both ends, all three sockets are locked by uipc_lock.  Once the
  153  *   connection is complete, the association with the head's lock is broken.
  154  *   The connecting socket and the socket returned from accept() have their
  155  *   lock pointers mutated away from uipc_lock, and back to the connecting
  156  *   socket's original, independent lock.  The head continues to be locked
  157  *   by uipc_lock.
  158  *
  159  * o If uipc_lock is determined to be a significant source of contention,
  160  *   it could easily be hashed out.  It is difficult to simply make it an
  161  *   independent lock because of visibility / garbage collection issues:
  162  *   if a socket has been associated with a lock at any point, that lock
  163  *   must remain valid until the socket is no longer visible in the system.
  164  *   The lock must not be freed or otherwise destroyed until any sockets
  165  *   that had referenced it have also been destroyed.
  166  */
  167 const struct sockaddr_un sun_noname = {
  168         .sun_len = sizeof(sun_noname),
  169         .sun_family = AF_LOCAL,
  170 };
  171 ino_t   unp_ino;                        /* prototype for fake inode numbers */
  172 
  173 struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
  174 static void unp_mark(file_t *);
  175 static void unp_scan(struct mbuf *, void (*)(file_t *), int);
  176 static void unp_discard_now(file_t *);
  177 static void unp_discard_later(file_t *);
  178 static void unp_thread(void *);
  179 static void unp_thread_kick(void);
  180 static kmutex_t *uipc_lock;
  181 
  182 static kcondvar_t unp_thread_cv;
  183 static lwp_t *unp_thread_lwp;
  184 static SLIST_HEAD(,file) unp_thread_discard;
  185 static int unp_defer;
  186 
  187 /*
  188  * Initialize Unix protocols.
  189  */
  190 void
  191 uipc_init(void)
  192 {
  193         int error;
  194 
  195         uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
  196         cv_init(&unp_thread_cv, "unpgc");
  197 
  198         error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
  199             NULL, &unp_thread_lwp, "unpgc");
  200         if (error != 0)
  201                 panic("uipc_init %d", error);
  202 }
  203 
  204 /*
  205  * A connection succeeded: disassociate both endpoints from the head's
  206  * lock, and make them share their own lock.  There is a race here: for
  207  * a very brief time one endpoint will be locked by a different lock
  208  * than the other end.  However, since the current thread holds the old
  209  * lock (the listening socket's lock, the head) access can still only be
  210  * made to one side of the connection.
  211  */
  212 static void
  213 unp_setpeerlocks(struct socket *so, struct socket *so2)
  214 {
  215         struct unpcb *unp;
  216         kmutex_t *lock;
  217 
  218         KASSERT(solocked2(so, so2));
  219 
  220         /*
  221          * Bail out if either end of the socket is not yet fully
  222          * connected or accepted.  We only break the lock association
  223          * with the head when the pair of sockets stand completely
  224          * on their own.
  225          */
  226         if (so->so_head != NULL || so2->so_head != NULL)
  227                 return;
  228 
  229         /*
  230          * Drop references to old lock.  A third reference (from the
  231          * queue head) must be held as we still hold its lock.  Bonus:
  232          * we don't need to worry about garbage collecting the lock.
  233          */
  234         lock = so->so_lock;
  235         KASSERT(lock == uipc_lock);
  236         mutex_obj_free(lock);
  237         mutex_obj_free(lock);
  238 
  239         /*
  240          * Grab stream lock from the initiator and share between the two
  241          * endpoints.  Issue memory barrier to ensure all modifications
  242          * become globally visible before the lock change.  so2 is
  243          * assumed not to have a stream lock, because it was created
  244          * purely for the server side to accept this connection and
  245          * started out life using the domain-wide lock.
  246          */
  247         unp = sotounpcb(so);
  248         KASSERT(unp->unp_streamlock != NULL);
  249         KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
  250         lock = unp->unp_streamlock;
  251         unp->unp_streamlock = NULL;
  252         mutex_obj_hold(lock);
  253         membar_exit();
  254         /*
  255          * possible race if lock is not held - see comment in
  256          * uipc_usrreq(PRU_ACCEPT).
  257          */
  258         KASSERT(mutex_owned(lock));
  259         solockreset(so, lock);
  260         solockreset(so2, lock);
  261 }
  262 
  263 /*
  264  * Reset a socket's lock back to the domain-wide lock.
  265  */
  266 static void
  267 unp_resetlock(struct socket *so)
  268 {
  269         kmutex_t *olock, *nlock;
  270         struct unpcb *unp;
  271 
  272         KASSERT(solocked(so));
  273 
  274         olock = so->so_lock;
  275         nlock = uipc_lock;
  276         if (olock == nlock)
  277                 return;
  278         unp = sotounpcb(so);
  279         KASSERT(unp->unp_streamlock == NULL);
  280         unp->unp_streamlock = olock;
  281         mutex_obj_hold(nlock);
  282         mutex_enter(nlock);
  283         solockreset(so, nlock);
  284         mutex_exit(olock);
  285 }
  286 
  287 static void
  288 unp_free(struct unpcb *unp)
  289 {
  290 
  291         if (unp->unp_addr)
  292                 free(unp->unp_addr, M_SONAME);
  293         if (unp->unp_streamlock != NULL)
  294                 mutex_obj_free(unp->unp_streamlock);
  295         free(unp, M_PCB);
  296 }
  297 
  298 int
  299 unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
  300         struct lwp *l)
  301 {
  302         struct socket *so2;
  303         const struct sockaddr_un *sun;
  304 
  305         so2 = unp->unp_conn->unp_socket;
  306 
  307         KASSERT(solocked(so2));
  308 
  309         if (unp->unp_addr)
  310                 sun = unp->unp_addr;
  311         else
  312                 sun = &sun_noname;
  313         if (unp->unp_conn->unp_flags & UNP_WANTCRED)
  314                 control = unp_addsockcred(l, control);
  315         if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
  316             control) == 0) {
  317                 so2->so_rcv.sb_overflowed++;
  318                 unp_dispose(control);
  319                 m_freem(control);
  320                 m_freem(m);
  321                 return (ENOBUFS);
  322         } else {
  323                 sorwakeup(so2);
  324                 return (0);
  325         }
  326 }
  327 
  328 void
  329 unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
  330 {
  331         const struct sockaddr_un *sun;
  332         struct unpcb *unp;
  333         bool ext;
  334 
  335         KASSERT(solocked(so));
  336         unp = sotounpcb(so);
  337         ext = false;
  338 
  339         for (;;) {
  340                 sun = NULL;
  341                 if (peeraddr) {
  342                         if (unp->unp_conn && unp->unp_conn->unp_addr)
  343                                 sun = unp->unp_conn->unp_addr;
  344                 } else {
  345                         if (unp->unp_addr)
  346                                 sun = unp->unp_addr;
  347                 }
  348                 if (sun == NULL)
  349                         sun = &sun_noname;
  350                 nam->m_len = sun->sun_len;
  351                 if (nam->m_len > MLEN && !ext) {
  352                         sounlock(so);
  353                         MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK);
  354                         solock(so);
  355                         ext = true;
  356                 } else {
  357                         KASSERT(nam->m_len <= MAXPATHLEN * 2);
  358                         memcpy(mtod(nam, void *), sun, (size_t)nam->m_len);
  359                         break;
  360                 }
  361         }
  362 }
  363 
  364 /*ARGSUSED*/
  365 int
  366 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
  367         struct mbuf *control, struct lwp *l)
  368 {
  369         struct unpcb *unp = sotounpcb(so);
  370         struct socket *so2;
  371         struct proc *p;
  372         u_int newhiwat;
  373         int error = 0;
  374 
  375         if (req == PRU_CONTROL)
  376                 return (EOPNOTSUPP);
  377 
  378 #ifdef DIAGNOSTIC
  379         if (req != PRU_SEND && req != PRU_SENDOOB && control)
  380                 panic("uipc_usrreq: unexpected control mbuf");
  381 #endif
  382         p = l ? l->l_proc : NULL;
  383         if (req != PRU_ATTACH) {
  384                 if (unp == 0) {
  385                         error = EINVAL;
  386                         goto release;
  387                 }
  388                 KASSERT(solocked(so));
  389         }
  390 
  391         switch (req) {
  392 
  393         case PRU_ATTACH:
  394                 if (unp != 0) {
  395                         error = EISCONN;
  396                         break;
  397                 }
  398                 error = unp_attach(so);
  399                 break;
  400 
  401         case PRU_DETACH:
  402                 unp_detach(unp);
  403                 break;
  404 
  405         case PRU_BIND:
  406                 KASSERT(l != NULL);
  407                 error = unp_bind(so, nam, l);
  408                 break;
  409 
  410         case PRU_LISTEN:
  411                 /*
  412                  * If the socket can accept a connection, it must be
  413                  * locked by uipc_lock.
  414                  */
  415                 unp_resetlock(so);
  416                 if (unp->unp_vnode == 0)
  417                         error = EINVAL;
  418                 break;
  419 
  420         case PRU_CONNECT:
  421                 KASSERT(l != NULL);
  422                 error = unp_connect(so, nam, l);
  423                 break;
  424 
  425         case PRU_CONNECT2:
  426                 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
  427                 break;
  428 
  429         case PRU_DISCONNECT:
  430                 unp_disconnect(unp);
  431                 break;
  432 
  433         case PRU_ACCEPT:
  434                 KASSERT(so->so_lock == uipc_lock);
  435                 /*
  436                  * Mark the initiating STREAM socket as connected *ONLY*
  437                  * after it's been accepted.  This prevents a client from
  438                  * overrunning a server and receiving ECONNREFUSED.
  439                  */
  440                 if (unp->unp_conn == NULL)
  441                         break;
  442                 so2 = unp->unp_conn->unp_socket;
  443                 if (so2->so_state & SS_ISCONNECTING) {
  444                         KASSERT(solocked2(so, so->so_head));
  445                         KASSERT(solocked2(so2, so->so_head));
  446                         soisconnected(so2);
  447                 }
  448                 /*
  449                  * If the connection is fully established, break the
  450                  * association with uipc_lock and give the connected
  451                  * pair a seperate lock to share.
  452                  * There is a race here: sotounpcb(so2)->unp_streamlock
  453                  * is not locked, so when changing so2->so_lock
  454                  * another thread can grab it while so->so_lock is still
  455                  * pointing to the (locked) uipc_lock.
  456                  * this should be harmless, exept that this makes
  457                  * solocked2() and solocked() unreliable.
  458                  * Another problem is that unp_setaddr() expects the
  459                  * the socket locked. Grabing sotounpcb(so2)->unp_streamlock
  460                  * fixes both issues.
  461                  */
  462                 mutex_enter(sotounpcb(so2)->unp_streamlock);
  463                 unp_setpeerlocks(so2, so);
  464                 /*
  465                  * Only now return peer's address, as we may need to
  466                  * block in order to allocate memory.
  467                  *
  468                  * XXX Minor race: connection can be broken while
  469                  * lock is dropped in unp_setaddr().  We will return
  470                  * error == 0 and sun_noname as the peer address.
  471                  */
  472                 unp_setaddr(so, nam, true);
  473                 /* so_lock now points to unp_streamlock */
  474                 mutex_exit(so2->so_lock);
  475                 break;
  476 
  477         case PRU_SHUTDOWN:
  478                 socantsendmore(so);
  479                 unp_shutdown(unp);
  480                 break;
  481 
  482         case PRU_RCVD:
  483                 switch (so->so_type) {
  484 
  485                 case SOCK_DGRAM:
  486                         panic("uipc 1");
  487                         /*NOTREACHED*/
  488 
  489                 case SOCK_STREAM:
  490 #define rcv (&so->so_rcv)
  491 #define snd (&so2->so_snd)
  492                         if (unp->unp_conn == 0)
  493                                 break;
  494                         so2 = unp->unp_conn->unp_socket;
  495                         KASSERT(solocked2(so, so2));
  496                         /*
  497                          * Adjust backpressure on sender
  498                          * and wakeup any waiting to write.
  499                          */
  500                         snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
  501                         unp->unp_mbcnt = rcv->sb_mbcnt;
  502                         newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
  503                         (void)chgsbsize(so2->so_uidinfo,
  504                             &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
  505                         unp->unp_cc = rcv->sb_cc;
  506                         sowwakeup(so2);
  507 #undef snd
  508 #undef rcv
  509                         break;
  510 
  511                 default:
  512                         panic("uipc 2");
  513                 }
  514                 break;
  515 
  516         case PRU_SEND:
  517                 /*
  518                  * Note: unp_internalize() rejects any control message
  519                  * other than SCM_RIGHTS, and only allows one.  This
  520                  * has the side-effect of preventing a caller from
  521                  * forging SCM_CREDS.
  522                  */
  523                 if (control) {
  524                         sounlock(so);
  525                         error = unp_internalize(&control);
  526                         solock(so);
  527                         if (error != 0) {
  528                                 m_freem(control);
  529                                 m_freem(m);
  530                                 break;
  531                         }
  532                 }
  533                 switch (so->so_type) {
  534 
  535                 case SOCK_DGRAM: {
  536                         KASSERT(so->so_lock == uipc_lock);
  537                         if (nam) {
  538                                 if ((so->so_state & SS_ISCONNECTED) != 0)
  539                                         error = EISCONN;
  540                                 else {
  541                                         /*
  542                                          * Note: once connected, the
  543                                          * socket's lock must not be
  544                                          * dropped until we have sent
  545                                          * the message and disconnected.
  546                                          * This is necessary to prevent
  547                                          * intervening control ops, like
  548                                          * another connection.
  549                                          */
  550                                         error = unp_connect(so, nam, l);
  551                                 }
  552                         } else {
  553                                 if ((so->so_state & SS_ISCONNECTED) == 0)
  554                                         error = ENOTCONN;
  555                         }
  556                         if (error) {
  557                                 unp_dispose(control);
  558                                 m_freem(control);
  559                                 m_freem(m);
  560                                 break;
  561                         }
  562                         KASSERT(p != NULL);
  563                         error = unp_output(m, control, unp, l);
  564                         if (nam)
  565                                 unp_disconnect(unp);
  566                         break;
  567                 }
  568 
  569                 case SOCK_STREAM:
  570 #define rcv (&so2->so_rcv)
  571 #define snd (&so->so_snd)
  572                         if (unp->unp_conn == NULL) {
  573                                 error = ENOTCONN;
  574                                 break;
  575                         }
  576                         so2 = unp->unp_conn->unp_socket;
  577                         KASSERT(solocked2(so, so2));
  578                         if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
  579                                 /*
  580                                  * Credentials are passed only once on
  581                                  * SOCK_STREAM.
  582                                  */
  583                                 unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
  584                                 control = unp_addsockcred(l, control);
  585                         }
  586                         /*
  587                          * Send to paired receive port, and then reduce
  588                          * send buffer hiwater marks to maintain backpressure.
  589                          * Wake up readers.
  590                          */
  591                         if (control) {
  592                                 if (sbappendcontrol(rcv, m, control) != 0)
  593                                         control = NULL;
  594                         } else
  595                                 sbappend(rcv, m);
  596                         snd->sb_mbmax -=
  597                             rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
  598                         unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
  599                         newhiwat = snd->sb_hiwat -
  600                             (rcv->sb_cc - unp->unp_conn->unp_cc);
  601                         (void)chgsbsize(so->so_uidinfo,
  602                             &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
  603                         unp->unp_conn->unp_cc = rcv->sb_cc;
  604                         sorwakeup(so2);
  605 #undef snd
  606 #undef rcv
  607                         if (control != NULL) {
  608                                 unp_dispose(control);
  609                                 m_freem(control);
  610                         }
  611                         break;
  612 
  613                 default:
  614                         panic("uipc 4");
  615                 }
  616                 break;
  617 
  618         case PRU_ABORT:
  619                 (void)unp_drop(unp, ECONNABORTED);
  620 
  621                 KASSERT(so->so_head == NULL);
  622 #ifdef DIAGNOSTIC
  623                 if (so->so_pcb == 0)
  624                         panic("uipc 5: drop killed pcb");
  625 #endif
  626                 unp_detach(unp);
  627                 break;
  628 
  629         case PRU_SENSE:
  630                 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
  631                 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
  632                         so2 = unp->unp_conn->unp_socket;
  633                         KASSERT(solocked2(so, so2));
  634                         ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
  635                 }
  636                 ((struct stat *) m)->st_dev = NODEV;
  637                 if (unp->unp_ino == 0)
  638                         unp->unp_ino = unp_ino++;
  639                 ((struct stat *) m)->st_atimespec =
  640                     ((struct stat *) m)->st_mtimespec =
  641                     ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
  642                 ((struct stat *) m)->st_ino = unp->unp_ino;
  643                 return (0);
  644 
  645         case PRU_RCVOOB:
  646                 error = EOPNOTSUPP;
  647                 break;
  648 
  649         case PRU_SENDOOB:
  650                 m_freem(control);
  651                 m_freem(m);
  652                 error = EOPNOTSUPP;
  653                 break;
  654 
  655         case PRU_SOCKADDR:
  656                 unp_setaddr(so, nam, false);
  657                 break;
  658 
  659         case PRU_PEERADDR:
  660                 unp_setaddr(so, nam, true);
  661                 break;
  662 
  663         default:
  664                 panic("piusrreq");
  665         }
  666 
  667 release:
  668         return (error);
  669 }
  670 
  671 /*
  672  * Unix domain socket option processing.
  673  */
  674 int
  675 uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
  676 {
  677         struct unpcb *unp = sotounpcb(so);
  678         int optval = 0, error = 0;
  679 
  680         KASSERT(solocked(so));
  681 
  682         if (sopt->sopt_level != 0) {
  683                 error = ENOPROTOOPT;
  684         } else switch (op) {
  685 
  686         case PRCO_SETOPT:
  687                 switch (sopt->sopt_name) {
  688                 case LOCAL_CREDS:
  689                 case LOCAL_CONNWAIT:
  690                         error = sockopt_getint(sopt, &optval);
  691                         if (error)
  692                                 break;
  693                         switch (sopt->sopt_name) {
  694 #define OPTSET(bit) \
  695         if (optval) \
  696                 unp->unp_flags |= (bit); \
  697         else \
  698                 unp->unp_flags &= ~(bit);
  699 
  700                         case LOCAL_CREDS:
  701                                 OPTSET(UNP_WANTCRED);
  702                                 break;
  703                         case LOCAL_CONNWAIT:
  704                                 OPTSET(UNP_CONNWAIT);
  705                                 break;
  706                         }
  707                         break;
  708 #undef OPTSET
  709 
  710                 default:
  711                         error = ENOPROTOOPT;
  712                         break;
  713                 }
  714                 break;
  715 
  716         case PRCO_GETOPT:
  717                 sounlock(so);
  718                 switch (sopt->sopt_name) {
  719                 case LOCAL_PEEREID:
  720                         if (unp->unp_flags & UNP_EIDSVALID) {
  721                                 error = sockopt_set(sopt,
  722                                     &unp->unp_connid, sizeof(unp->unp_connid));
  723                         } else {
  724                                 error = EINVAL;
  725                         }
  726                         break;
  727                 case LOCAL_CREDS:
  728 #define OPTBIT(bit)     (unp->unp_flags & (bit) ? 1 : 0)
  729 
  730                         optval = OPTBIT(UNP_WANTCRED);
  731                         error = sockopt_setint(sopt, optval);
  732                         break;
  733 #undef OPTBIT
  734 
  735                 default:
  736                         error = ENOPROTOOPT;
  737                         break;
  738                 }
  739                 solock(so);
  740                 break;
  741         }
  742         return (error);
  743 }
  744 
  745 /*
  746  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
  747  * for stream sockets, although the total for sender and receiver is
  748  * actually only PIPSIZ.
  749  * Datagram sockets really use the sendspace as the maximum datagram size,
  750  * and don't really want to reserve the sendspace.  Their recvspace should
  751  * be large enough for at least one max-size datagram plus address.
  752  */
  753 #define PIPSIZ  4096
  754 u_long  unpst_sendspace = PIPSIZ;
  755 u_long  unpst_recvspace = PIPSIZ;
  756 u_long  unpdg_sendspace = 2*1024;       /* really max datagram size */
  757 u_long  unpdg_recvspace = 4*1024;
  758 
  759 u_int   unp_rights;                     /* files in flight */
  760 u_int   unp_rights_ratio = 2;           /* limit, fraction of maxfiles */
  761 
  762 int
  763 unp_attach(struct socket *so)
  764 {
  765         struct unpcb *unp;
  766         int error;
  767 
  768         switch (so->so_type) {
  769         case SOCK_STREAM:
  770                 if (so->so_lock == NULL) {
  771                         /* 
  772                          * XXX Assuming that no socket locks are held,
  773                          * as this call may sleep.
  774                          */
  775                         so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
  776                         solock(so);
  777                 }
  778                 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
  779                         error = soreserve(so, unpst_sendspace, unpst_recvspace);
  780                         if (error != 0)
  781                                 return (error);
  782                 }
  783                 break;
  784 
  785         case SOCK_DGRAM:
  786                 if (so->so_lock == NULL) {
  787                         mutex_obj_hold(uipc_lock);
  788                         so->so_lock = uipc_lock;
  789                         solock(so);
  790                 }
  791                 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
  792                         error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
  793                         if (error != 0)
  794                                 return (error);
  795                 }
  796                 break;
  797 
  798         default:
  799                 panic("unp_attach");
  800         }
  801         KASSERT(solocked(so));
  802         unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
  803         if (unp == NULL)
  804                 return (ENOBUFS);
  805         memset((void *)unp, 0, sizeof(*unp));
  806         unp->unp_socket = so;
  807         so->so_pcb = unp;
  808         nanotime(&unp->unp_ctime);
  809         return (0);
  810 }
  811 
  812 void
  813 unp_detach(struct unpcb *unp)
  814 {
  815         struct socket *so;
  816         vnode_t *vp;
  817 
  818         so = unp->unp_socket;
  819 
  820  retry:
  821         if ((vp = unp->unp_vnode) != NULL) {
  822                 sounlock(so);
  823                 /* Acquire v_interlock to protect against unp_connect(). */
  824                 /* XXXAD racy */
  825                 mutex_enter(&vp->v_interlock);
  826                 vp->v_socket = NULL;
  827                 vrelel(vp, 0);
  828                 solock(so);
  829                 unp->unp_vnode = NULL;
  830         }
  831         if (unp->unp_conn)
  832                 unp_disconnect(unp);
  833         while (unp->unp_refs) {
  834                 KASSERT(solocked2(so, unp->unp_refs->unp_socket));
  835                 if (unp_drop(unp->unp_refs, ECONNRESET)) {
  836                         solock(so);
  837                         goto retry;
  838                 }
  839         }
  840         soisdisconnected(so);
  841         so->so_pcb = NULL;
  842         if (unp_rights) {
  843                 /*
  844                  * Normally the receive buffer is flushed later, in sofree,
  845                  * but if our receive buffer holds references to files that
  846                  * are now garbage, we will enqueue those file references to
  847                  * the garbage collector and kick it into action.
  848                  */
  849                 sorflush(so);
  850                 unp_free(unp);
  851                 unp_thread_kick();
  852         } else
  853                 unp_free(unp);
  854 }
  855 
  856 int
  857 unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
  858 {
  859         struct sockaddr_un *sun;
  860         struct unpcb *unp;
  861         vnode_t *vp;
  862         struct vattr vattr;
  863         size_t addrlen;
  864         int error;
  865         struct nameidata nd;
  866         proc_t *p;
  867 
  868         unp = sotounpcb(so);
  869         if (unp->unp_vnode != NULL)
  870                 return (EINVAL);
  871         if ((unp->unp_flags & UNP_BUSY) != 0) {
  872                 /*
  873                  * EALREADY may not be strictly accurate, but since this
  874                  * is a major application error it's hardly a big deal.
  875                  */
  876                 return (EALREADY);
  877         }
  878         unp->unp_flags |= UNP_BUSY;
  879         sounlock(so);
  880 
  881         /*
  882          * Allocate the new sockaddr.  We have to allocate one
  883          * extra byte so that we can ensure that the pathname
  884          * is nul-terminated.
  885          */
  886         p = l->l_proc;
  887         addrlen = nam->m_len + 1;
  888         sun = malloc(addrlen, M_SONAME, M_WAITOK);
  889         m_copydata(nam, 0, nam->m_len, (void *)sun);
  890         *(((char *)sun) + nam->m_len) = '\0';
  891 
  892         NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE,
  893             sun->sun_path);
  894 
  895 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
  896         if ((error = namei(&nd)) != 0)
  897                 goto bad;
  898         vp = nd.ni_vp;
  899         if (vp != NULL) {
  900                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
  901                 if (nd.ni_dvp == vp)
  902                         vrele(nd.ni_dvp);
  903                 else
  904                         vput(nd.ni_dvp);
  905                 vrele(vp);
  906                 error = EADDRINUSE;
  907                 goto bad;
  908         }
  909         VATTR_NULL(&vattr);
  910         vattr.va_type = VSOCK;
  911         vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
  912         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
  913         if (error)
  914                 goto bad;
  915         vp = nd.ni_vp;
  916         solock(so);
  917         vp->v_socket = unp->unp_socket;
  918         unp->unp_vnode = vp;
  919         unp->unp_addrlen = addrlen;
  920         unp->unp_addr = sun;
  921         unp->unp_connid.unp_pid = p->p_pid;
  922         unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
  923         unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
  924         unp->unp_flags |= UNP_EIDSBIND;
  925         VOP_UNLOCK(vp, 0);
  926         unp->unp_flags &= ~UNP_BUSY;
  927         return (0);
  928 
  929  bad:
  930         free(sun, M_SONAME);
  931         solock(so);
  932         unp->unp_flags &= ~UNP_BUSY;
  933         return (error);
  934 }
  935 
  936 int
  937 unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
  938 {
  939         struct sockaddr_un *sun;
  940         vnode_t *vp;
  941         struct socket *so2, *so3;
  942         struct unpcb *unp, *unp2, *unp3;
  943         size_t addrlen;
  944         int error;
  945         struct nameidata nd;
  946 
  947         unp = sotounpcb(so);
  948         if ((unp->unp_flags & UNP_BUSY) != 0) {
  949                 /*
  950                  * EALREADY may not be strictly accurate, but since this
  951                  * is a major application error it's hardly a big deal.
  952                  */
  953                 return (EALREADY);
  954         }
  955         unp->unp_flags |= UNP_BUSY;
  956         sounlock(so);
  957 
  958         /*
  959          * Allocate a temporary sockaddr.  We have to allocate one extra
  960          * byte so that we can ensure that the pathname is nul-terminated.
  961          * When we establish the connection, we copy the other PCB's
  962          * sockaddr to our own.
  963          */
  964         addrlen = nam->m_len + 1;
  965         sun = malloc(addrlen, M_SONAME, M_WAITOK);
  966         m_copydata(nam, 0, nam->m_len, (void *)sun);
  967         *(((char *)sun) + nam->m_len) = '\0';
  968 
  969         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE,
  970             sun->sun_path);
  971 
  972         if ((error = namei(&nd)) != 0)
  973                 goto bad2;
  974         vp = nd.ni_vp;
  975         if (vp->v_type != VSOCK) {
  976                 error = ENOTSOCK;
  977                 goto bad;
  978         }
  979         if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
  980                 goto bad;
  981         /* Acquire v_interlock to protect against unp_detach(). */
  982         mutex_enter(&vp->v_interlock);
  983         so2 = vp->v_socket;
  984         if (so2 == NULL) {
  985                 mutex_exit(&vp->v_interlock);
  986                 error = ECONNREFUSED;
  987                 goto bad;
  988         }
  989         if (so->so_type != so2->so_type) {
  990                 mutex_exit(&vp->v_interlock);
  991                 error = EPROTOTYPE;
  992                 goto bad;
  993         }
  994         solock(so);
  995         unp_resetlock(so);
  996         mutex_exit(&vp->v_interlock);
  997         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
  998                 /*
  999                  * This may seem somewhat fragile but is OK: if we can
 1000                  * see SO_ACCEPTCONN set on the endpoint, then it must
 1001                  * be locked by the domain-wide uipc_lock.
 1002                  */
 1003                 KASSERT((so->so_options & SO_ACCEPTCONN) == 0 ||
 1004                     so2->so_lock == uipc_lock);
 1005                 if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
 1006                     (so3 = sonewconn(so2, 0)) == 0) {
 1007                         error = ECONNREFUSED;
 1008                         sounlock(so);
 1009                         goto bad;
 1010                 }
 1011                 unp2 = sotounpcb(so2);
 1012                 unp3 = sotounpcb(so3);
 1013                 if (unp2->unp_addr) {
 1014                         unp3->unp_addr = malloc(unp2->unp_addrlen,
 1015                             M_SONAME, M_WAITOK);
 1016                         memcpy(unp3->unp_addr, unp2->unp_addr,
 1017                             unp2->unp_addrlen);
 1018                         unp3->unp_addrlen = unp2->unp_addrlen;
 1019                 }
 1020                 unp3->unp_flags = unp2->unp_flags;
 1021                 unp3->unp_connid.unp_pid = l->l_proc->p_pid;
 1022                 unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
 1023                 unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
 1024                 unp3->unp_flags |= UNP_EIDSVALID;
 1025                 if (unp2->unp_flags & UNP_EIDSBIND) {
 1026                         unp->unp_connid = unp2->unp_connid;
 1027                         unp->unp_flags |= UNP_EIDSVALID;
 1028                 }
 1029                 so2 = so3;
 1030         }
 1031         error = unp_connect2(so, so2, PRU_CONNECT);
 1032         sounlock(so);
 1033  bad:
 1034         vput(vp);
 1035  bad2:
 1036         free(sun, M_SONAME);
 1037         solock(so);
 1038         unp->unp_flags &= ~UNP_BUSY;
 1039         return (error);
 1040 }
 1041 
 1042 int
 1043 unp_connect2(struct socket *so, struct socket *so2, int req)
 1044 {
 1045         struct unpcb *unp = sotounpcb(so);
 1046         struct unpcb *unp2;
 1047 
 1048         if (so2->so_type != so->so_type)
 1049                 return (EPROTOTYPE);
 1050 
 1051         /*
 1052          * All three sockets involved must be locked by same lock:
 1053          *
 1054          * local endpoint (so)
 1055          * remote endpoint (so2)
 1056          * queue head (so->so_head, only if PR_CONNREQUIRED)
 1057          */
 1058         KASSERT(solocked2(so, so2));
 1059         if (so->so_head != NULL) {
 1060                 KASSERT(so->so_lock == uipc_lock);
 1061                 KASSERT(solocked2(so, so->so_head));
 1062         }
 1063 
 1064         unp2 = sotounpcb(so2);
 1065         unp->unp_conn = unp2;
 1066         switch (so->so_type) {
 1067 
 1068         case SOCK_DGRAM:
 1069                 unp->unp_nextref = unp2->unp_refs;
 1070                 unp2->unp_refs = unp;
 1071                 soisconnected(so);
 1072                 break;
 1073 
 1074         case SOCK_STREAM:
 1075                 unp2->unp_conn = unp;
 1076                 if (req == PRU_CONNECT &&
 1077                     ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
 1078                         soisconnecting(so);
 1079                 else
 1080                         soisconnected(so);
 1081                 soisconnected(so2);
 1082                 /*
 1083                  * If the connection is fully established, break the
 1084                  * association with uipc_lock and give the connected
 1085                  * pair a seperate lock to share.  For CONNECT2, we
 1086                  * require that the locks already match (the sockets
 1087                  * are created that way).
 1088                  */
 1089                 if (req == PRU_CONNECT)
 1090                         unp_setpeerlocks(so, so2);
 1091                 break;
 1092 
 1093         default:
 1094                 panic("unp_connect2");
 1095         }
 1096         return (0);
 1097 }
 1098 
 1099 void
 1100 unp_disconnect(struct unpcb *unp)
 1101 {
 1102         struct unpcb *unp2 = unp->unp_conn;
 1103         struct socket *so;
 1104 
 1105         if (unp2 == 0)
 1106                 return;
 1107         unp->unp_conn = 0;
 1108         so = unp->unp_socket;
 1109         switch (so->so_type) {
 1110         case SOCK_DGRAM:
 1111                 if (unp2->unp_refs == unp)
 1112                         unp2->unp_refs = unp->unp_nextref;
 1113                 else {
 1114                         unp2 = unp2->unp_refs;
 1115                         for (;;) {
 1116                                 KASSERT(solocked2(so, unp2->unp_socket));
 1117                                 if (unp2 == 0)
 1118                                         panic("unp_disconnect");
 1119                                 if (unp2->unp_nextref == unp)
 1120                                         break;
 1121                                 unp2 = unp2->unp_nextref;
 1122                         }
 1123                         unp2->unp_nextref = unp->unp_nextref;
 1124                 }
 1125                 unp->unp_nextref = 0;
 1126                 so->so_state &= ~SS_ISCONNECTED;
 1127                 break;
 1128 
 1129         case SOCK_STREAM:
 1130                 KASSERT(solocked2(so, unp2->unp_socket));
 1131                 soisdisconnected(so);
 1132                 unp2->unp_conn = 0;
 1133                 soisdisconnected(unp2->unp_socket);
 1134                 break;
 1135         }
 1136 }
 1137 
 1138 #ifdef notdef
 1139 unp_abort(struct unpcb *unp)
 1140 {
 1141         unp_detach(unp);
 1142 }
 1143 #endif
 1144 
 1145 void
 1146 unp_shutdown(struct unpcb *unp)
 1147 {
 1148         struct socket *so;
 1149 
 1150         if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
 1151             (so = unp->unp_conn->unp_socket))
 1152                 socantrcvmore(so);
 1153 }
 1154 
 1155 bool
 1156 unp_drop(struct unpcb *unp, int errno)
 1157 {
 1158         struct socket *so = unp->unp_socket;
 1159 
 1160         KASSERT(solocked(so));
 1161 
 1162         so->so_error = errno;
 1163         unp_disconnect(unp);
 1164         if (so->so_head) {
 1165                 so->so_pcb = NULL;
 1166                 /* sofree() drops the socket lock */
 1167                 sofree(so);
 1168                 unp_free(unp);
 1169                 return true;
 1170         }
 1171         return false;
 1172 }
 1173 
 1174 #ifdef notdef
 1175 unp_drain(void)
 1176 {
 1177 
 1178 }
 1179 #endif
 1180 
 1181 int
 1182 unp_externalize(struct mbuf *rights, struct lwp *l)
 1183 {
 1184         struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
 1185         struct proc *p = l->l_proc;
 1186         int i, *fdp;
 1187         file_t **rp;
 1188         file_t *fp;
 1189         int nfds, error = 0;
 1190 
 1191         nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
 1192             sizeof(file_t *);
 1193         rp = (file_t **)CMSG_DATA(cm);
 1194 
 1195         fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
 1196         rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
 1197 
 1198         /* Make sure the recipient should be able to see the files.. */
 1199         if (p->p_cwdi->cwdi_rdir != NULL) {
 1200                 rp = (file_t **)CMSG_DATA(cm);
 1201                 for (i = 0; i < nfds; i++) {
 1202                         fp = *rp++;
 1203                         /*
 1204                          * If we are in a chroot'ed directory, and
 1205                          * someone wants to pass us a directory, make
 1206                          * sure it's inside the subtree we're allowed
 1207                          * to access.
 1208                          */
 1209                         if (fp->f_type == DTYPE_VNODE) {
 1210                                 vnode_t *vp = (vnode_t *)fp->f_data;
 1211                                 if ((vp->v_type == VDIR) &&
 1212                                     !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
 1213                                         error = EPERM;
 1214                                         break;
 1215                                 }
 1216                         }
 1217                 }
 1218         }
 1219 
 1220  restart:
 1221         rp = (file_t **)CMSG_DATA(cm);
 1222         if (error != 0) {
 1223                 for (i = 0; i < nfds; i++) {
 1224                         fp = *rp;
 1225                         *rp++ = 0;
 1226                         unp_discard_now(fp);
 1227                 }
 1228                 goto out;
 1229         }
 1230 
 1231         /*
 1232          * First loop -- allocate file descriptor table slots for the
 1233          * new files.
 1234          */
 1235         for (i = 0; i < nfds; i++) {
 1236                 fp = *rp++;
 1237                 if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
 1238                         /*
 1239                          * Back out what we've done so far.
 1240                          */
 1241                         for (--i; i >= 0; i--) {
 1242                                 fd_abort(p, NULL, fdp[i]);
 1243                         }
 1244                         if (error == ENOSPC) {
 1245                                 fd_tryexpand(p);
 1246                                 error = 0;
 1247                         } else {
 1248                                 /*
 1249                                  * This is the error that has historically
 1250                                  * been returned, and some callers may
 1251                                  * expect it.
 1252                                  */
 1253                                 error = EMSGSIZE;
 1254                         }
 1255                         goto restart;
 1256                 }
 1257         }
 1258 
 1259         /*
 1260          * Now that adding them has succeeded, update all of the
 1261          * file passing state and affix the descriptors.
 1262          */
 1263         rp = (file_t **)CMSG_DATA(cm);
 1264         for (i = 0; i < nfds; i++) {
 1265                 fp = *rp++;
 1266                 atomic_dec_uint(&unp_rights);
 1267                 fd_affix(p, fp, fdp[i]);
 1268                 mutex_enter(&fp->f_lock);
 1269                 fp->f_msgcount--;
 1270                 mutex_exit(&fp->f_lock);
 1271                 /*
 1272                  * Note that fd_affix() adds a reference to the file.
 1273                  * The file may already have been closed by another
 1274                  * LWP in the process, so we must drop the reference
 1275                  * added by unp_internalize() with closef().
 1276                  */
 1277                 closef(fp);
 1278         }
 1279 
 1280         /*
 1281          * Copy temporary array to message and adjust length, in case of
 1282          * transition from large file_t pointers to ints.
 1283          */
 1284         memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
 1285         cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
 1286         rights->m_len = CMSG_SPACE(nfds * sizeof(int));
 1287  out:
 1288         rw_exit(&p->p_cwdi->cwdi_lock);
 1289         free(fdp, M_TEMP);
 1290         return (error);
 1291 }
 1292 
 1293 int
 1294 unp_internalize(struct mbuf **controlp)
 1295 {
 1296         filedesc_t *fdescp = curlwp->l_fd;
 1297         struct mbuf *control = *controlp;
 1298         struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
 1299         file_t **rp, **files;
 1300         file_t *fp;
 1301         int i, fd, *fdp;
 1302         int nfds, error;
 1303         u_int maxmsg;
 1304 
 1305         error = 0;
 1306         newcm = NULL;
 1307 
 1308         /* Sanity check the control message header. */
 1309         if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
 1310             cm->cmsg_len > control->m_len ||
 1311             cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
 1312                 return (EINVAL);
 1313 
 1314         /*
 1315          * Verify that the file descriptors are valid, and acquire
 1316          * a reference to each.
 1317          */
 1318         nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
 1319         fdp = (int *)CMSG_DATA(cm);
 1320         maxmsg = maxfiles / unp_rights_ratio;
 1321         for (i = 0; i < nfds; i++) {
 1322                 fd = *fdp++;
 1323                 if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
 1324                         atomic_dec_uint(&unp_rights);
 1325                         nfds = i;
 1326                         error = EAGAIN;
 1327                         goto out;
 1328                 }
 1329                 if ((fp = fd_getfile(fd)) == NULL) {
 1330                         atomic_dec_uint(&unp_rights);
 1331                         nfds = i;
 1332                         error = EBADF;
 1333                         goto out;
 1334                 }
 1335         }
 1336 
 1337         /* Allocate new space and copy header into it. */
 1338         newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
 1339         if (newcm == NULL) {
 1340                 error = E2BIG;
 1341                 goto out;
 1342         }
 1343         memcpy(newcm, cm, sizeof(struct cmsghdr));
 1344         files = (file_t **)CMSG_DATA(newcm);
 1345 
 1346         /*
 1347          * Transform the file descriptors into file_t pointers, in
 1348          * reverse order so that if pointers are bigger than ints, the
 1349          * int won't get until we're done.  No need to lock, as we have
 1350          * already validated the descriptors with fd_getfile().
 1351          */
 1352         fdp = (int *)CMSG_DATA(cm) + nfds;
 1353         rp = files + nfds;
 1354         for (i = 0; i < nfds; i++) {
 1355                 fp = fdescp->fd_ofiles[*--fdp]->ff_file;
 1356                 KASSERT(fp != NULL);
 1357                 mutex_enter(&fp->f_lock);
 1358                 *--rp = fp;
 1359                 fp->f_count++;
 1360                 fp->f_msgcount++;
 1361                 mutex_exit(&fp->f_lock);
 1362         }
 1363 
 1364  out:
 1365         /* Release descriptor references. */
 1366         fdp = (int *)CMSG_DATA(cm);
 1367         for (i = 0; i < nfds; i++) {
 1368                 fd_putfile(*fdp++);
 1369                 if (error != 0) {
 1370                         atomic_dec_uint(&unp_rights);
 1371                 }
 1372         }
 1373 
 1374         if (error == 0) {
 1375                 if (control->m_flags & M_EXT) {
 1376                         m_freem(control);
 1377                         *controlp = control = m_get(M_WAIT, MT_CONTROL);
 1378                 }
 1379                 MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
 1380                     M_MBUF, NULL, NULL);
 1381                 cm = newcm;
 1382                 /*
 1383                  * Adjust message & mbuf to note amount of space
 1384                  * actually used.
 1385                  */
 1386                 cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
 1387                 control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
 1388         }
 1389 
 1390         return error;
 1391 }
 1392 
 1393 struct mbuf *
 1394 unp_addsockcred(struct lwp *l, struct mbuf *control)
 1395 {
 1396         struct cmsghdr *cmp;
 1397         struct sockcred *sc;
 1398         struct mbuf *m, *n;
 1399         int len, space, i;
 1400 
 1401         len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
 1402         space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
 1403 
 1404         m = m_get(M_WAIT, MT_CONTROL);
 1405         if (space > MLEN) {
 1406                 if (space > MCLBYTES)
 1407                         MEXTMALLOC(m, space, M_WAITOK);
 1408                 else
 1409                         m_clget(m, M_WAIT);
 1410                 if ((m->m_flags & M_EXT) == 0) {
 1411                         m_free(m);
 1412                         return (control);
 1413                 }
 1414         }
 1415 
 1416         m->m_len = space;
 1417         m->m_next = NULL;
 1418         cmp = mtod(m, struct cmsghdr *);
 1419         sc = (struct sockcred *)CMSG_DATA(cmp);
 1420         cmp->cmsg_len = len;
 1421         cmp->cmsg_level = SOL_SOCKET;
 1422         cmp->cmsg_type = SCM_CREDS;
 1423         sc->sc_uid = kauth_cred_getuid(l->l_cred);
 1424         sc->sc_euid = kauth_cred_geteuid(l->l_cred);
 1425         sc->sc_gid = kauth_cred_getgid(l->l_cred);
 1426         sc->sc_egid = kauth_cred_getegid(l->l_cred);
 1427         sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
 1428         for (i = 0; i < sc->sc_ngroups; i++)
 1429                 sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
 1430 
 1431         /*
 1432          * If a control message already exists, append us to the end.
 1433          */
 1434         if (control != NULL) {
 1435                 for (n = control; n->m_next != NULL; n = n->m_next)
 1436                         ;
 1437                 n->m_next = m;
 1438         } else
 1439                 control = m;
 1440 
 1441         return (control);
 1442 }
 1443 
 1444 /*
 1445  * Do a mark-sweep GC of files in the system, to free up any which are
 1446  * caught in flight to an about-to-be-closed socket.  Additionally,
 1447  * process deferred file closures.
 1448  */
 1449 static void
 1450 unp_gc(file_t *dp)
 1451 {
 1452         extern  struct domain unixdomain;
 1453         file_t *fp, *np;
 1454         struct socket *so, *so1;
 1455         u_int i, old, new;
 1456         bool didwork;
 1457 
 1458         KASSERT(curlwp == unp_thread_lwp);
 1459         KASSERT(mutex_owned(&filelist_lock));
 1460 
 1461         /*
 1462          * First, process deferred file closures.
 1463          */
 1464         while (!SLIST_EMPTY(&unp_thread_discard)) {
 1465                 fp = SLIST_FIRST(&unp_thread_discard);
 1466                 KASSERT(fp->f_unpcount > 0);
 1467                 KASSERT(fp->f_count > 0);
 1468                 KASSERT(fp->f_msgcount > 0);
 1469                 KASSERT(fp->f_count >= fp->f_unpcount);
 1470                 KASSERT(fp->f_count >= fp->f_msgcount);
 1471                 KASSERT(fp->f_msgcount >= fp->f_unpcount);
 1472                 SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
 1473                 i = fp->f_unpcount;
 1474                 fp->f_unpcount = 0;
 1475                 mutex_exit(&filelist_lock);
 1476                 for (; i != 0; i--) {
 1477                         unp_discard_now(fp);
 1478                 }
 1479                 mutex_enter(&filelist_lock);
 1480         }
 1481 
 1482         /*
 1483          * Clear mark bits.  Ensure that we don't consider new files
 1484          * entering the file table during this loop (they will not have
 1485          * FSCAN set).
 1486          */
 1487         unp_defer = 0;
 1488         LIST_FOREACH(fp, &filehead, f_list) {
 1489                 for (old = fp->f_flag;; old = new) {
 1490                         new = atomic_cas_uint(&fp->f_flag, old,
 1491                             (old | FSCAN) & ~(FMARK|FDEFER));
 1492                         if (__predict_true(old == new)) {
 1493                                 break;
 1494                         }
 1495                 }
 1496         }
 1497 
 1498         /*
 1499          * Iterate over the set of sockets, marking ones believed (based on
 1500          * refcount) to be referenced from a process, and marking for rescan
  1501          * sockets which are queued on a socket.  Rescan continues descending
 1502          * and searching for sockets referenced by sockets (FDEFER), until
 1503          * there are no more socket->socket references to be discovered.
 1504          */
 1505         do {
 1506                 didwork = false;
 1507                 for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
 1508                         KASSERT(mutex_owned(&filelist_lock));
 1509                         np = LIST_NEXT(fp, f_list);
 1510                         mutex_enter(&fp->f_lock);
 1511                         if ((fp->f_flag & FDEFER) != 0) {
 1512                                 atomic_and_uint(&fp->f_flag, ~FDEFER);
 1513                                 unp_defer--;
 1514                                 KASSERT(fp->f_count != 0);
 1515                         } else {
 1516                                 if (fp->f_count == 0 ||
 1517                                     (fp->f_flag & FMARK) != 0 ||
 1518                                     fp->f_count == fp->f_msgcount ||
 1519                                     fp->f_unpcount != 0) {
 1520                                         mutex_exit(&fp->f_lock);
 1521                                         continue;
 1522                                 }
 1523                         }
 1524                         atomic_or_uint(&fp->f_flag, FMARK);
 1525 
 1526                         if (fp->f_type != DTYPE_SOCKET ||
 1527                             (so = fp->f_data) == NULL ||
 1528                             so->so_proto->pr_domain != &unixdomain ||
 1529                             (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
 1530                                 mutex_exit(&fp->f_lock);
 1531                                 continue;
 1532                         }
 1533 
 1534                         /* Gain file ref, mark our position, and unlock. */
 1535                         didwork = true;
 1536                         LIST_INSERT_AFTER(fp, dp, f_list);
 1537                         fp->f_count++;
 1538                         mutex_exit(&fp->f_lock);
 1539                         mutex_exit(&filelist_lock);
 1540 
 1541                         /*
 1542                          * Mark files referenced from sockets queued on the
 1543                          * accept queue as well.
 1544                          */
 1545                         solock(so);
 1546                         unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
 1547                         if ((so->so_options & SO_ACCEPTCONN) != 0) {
 1548                                 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
 1549                                         unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
 1550                                 }
 1551                                 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
 1552                                         unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
 1553                                 }
 1554                         }
 1555                         sounlock(so);
 1556 
 1557                         /* Re-lock and restart from where we left off. */
 1558                         closef(fp);
 1559                         mutex_enter(&filelist_lock);
 1560                         np = LIST_NEXT(dp, f_list);
 1561                         LIST_REMOVE(dp, f_list);
 1562                 }
 1563                 /*
 1564                  * Bail early if we did nothing in the loop above.  Could
 1565                  * happen because of concurrent activity causing unp_defer
 1566                  * to get out of sync.
 1567                  */
 1568         } while (unp_defer != 0 && didwork);
 1569 
 1570         /*
 1571          * Sweep pass.
 1572          *
 1573          * We grab an extra reference to each of the files that are
 1574          * not otherwise accessible and then free the rights that are
 1575          * stored in messages on them.
 1576          */
 1577         for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
 1578                 KASSERT(mutex_owned(&filelist_lock));
 1579                 np = LIST_NEXT(fp, f_list);
 1580                 mutex_enter(&fp->f_lock);
 1581 
 1582                 /*
 1583                  * Ignore non-sockets.
 1584                  * Ignore dead sockets, or sockets with pending close.
 1585                  * Ignore sockets obviously referenced elsewhere. 
 1586                  * Ignore sockets marked as referenced by our scan.
 1587                  * Ignore new sockets that did not exist during the scan.
 1588                  */
 1589                 if (fp->f_type != DTYPE_SOCKET ||
 1590                     fp->f_count == 0 || fp->f_unpcount != 0 ||
 1591                     fp->f_count != fp->f_msgcount ||
 1592                     (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
 1593                         mutex_exit(&fp->f_lock);
 1594                         continue;
 1595                 }
 1596 
 1597                 /* Gain file ref, mark our position, and unlock. */
 1598                 LIST_INSERT_AFTER(fp, dp, f_list);
 1599                 fp->f_count++;
 1600                 mutex_exit(&fp->f_lock);
 1601                 mutex_exit(&filelist_lock);
 1602 
 1603                 /*
 1604                  * Flush all data from the socket's receive buffer.
 1605                  * This will cause files referenced only by the
 1606                  * socket to be queued for close.
 1607                  */
 1608                 so = fp->f_data;
 1609                 solock(so);
 1610                 sorflush(so);
 1611                 sounlock(so);
 1612 
 1613                 /* Re-lock and restart from where we left off. */
 1614                 closef(fp);
 1615                 mutex_enter(&filelist_lock);
 1616                 np = LIST_NEXT(dp, f_list);
 1617                 LIST_REMOVE(dp, f_list);
 1618         }
 1619 }
 1620 
 1621 /*
 1622  * Garbage collector thread.  While SCM_RIGHTS messages are in transit,
 1623  * wake once per second to garbage collect.  Run continually while we
 1624  * have deferred closes to process.
 1625  */
 1626 static void
 1627 unp_thread(void *cookie)
 1628 {
 1629         file_t *dp;
 1630 
 1631         /* Allocate a dummy file for our scans. */
 1632         if ((dp = fgetdummy()) == NULL) {
 1633                 panic("unp_thread");
 1634         }
 1635 
 1636         mutex_enter(&filelist_lock);
 1637         for (;;) {
 1638                 KASSERT(mutex_owned(&filelist_lock));
 1639                 if (SLIST_EMPTY(&unp_thread_discard)) {
 1640                         if (unp_rights != 0) {
 1641                                 (void)cv_timedwait(&unp_thread_cv,
 1642                                     &filelist_lock, hz);
 1643                         } else {
 1644                                 cv_wait(&unp_thread_cv, &filelist_lock);
 1645                         }
 1646                 }
 1647                 unp_gc(dp);
 1648         }
 1649         /* NOTREACHED */
 1650 }
 1651 
 1652 /*
 1653  * Kick the garbage collector into action if there is something for
 1654  * it to process.
 1655  */
 1656 static void
 1657 unp_thread_kick(void)
 1658 {
 1659 
 1660         if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
 1661                 mutex_enter(&filelist_lock);
 1662                 cv_signal(&unp_thread_cv);
 1663                 mutex_exit(&filelist_lock);
 1664         }
 1665 }
 1666 
 1667 void
 1668 unp_dispose(struct mbuf *m)
 1669 {
 1670 
 1671         if (m)
 1672                 unp_scan(m, unp_discard_later, 1);
 1673 }
 1674 
 1675 void
 1676 unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
 1677 {
 1678         struct mbuf *m;
 1679         file_t **rp, *fp;
 1680         struct cmsghdr *cm;
 1681         int i, qfds;
 1682 
 1683         while (m0) {
 1684                 for (m = m0; m; m = m->m_next) {
 1685                         if (m->m_type != MT_CONTROL ||
 1686                             m->m_len < sizeof(*cm)) {
 1687                                 continue;
 1688                         }
 1689                         cm = mtod(m, struct cmsghdr *);
 1690                         if (cm->cmsg_level != SOL_SOCKET ||
 1691                             cm->cmsg_type != SCM_RIGHTS)
 1692                                 continue;
 1693                         qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
 1694                             / sizeof(file_t *);
 1695                         rp = (file_t **)CMSG_DATA(cm);
 1696                         for (i = 0; i < qfds; i++) {
 1697                                 fp = *rp;
 1698                                 if (discard) {
 1699                                         *rp = 0;
 1700                                 }
 1701                                 (*op)(fp);
 1702                                 rp++;
 1703                         }
 1704                 }
 1705                 m0 = m0->m_nextpkt;
 1706         }
 1707 }
 1708 
 1709 void
 1710 unp_mark(file_t *fp)
 1711 {
 1712 
 1713         if (fp == NULL)
 1714                 return;
 1715 
 1716         /* If we're already deferred, don't screw up the defer count */
 1717         mutex_enter(&fp->f_lock);
 1718         if (fp->f_flag & (FMARK | FDEFER)) {
 1719                 mutex_exit(&fp->f_lock);
 1720                 return;
 1721         }
 1722 
 1723         /*
 1724          * Minimize the number of deferrals...  Sockets are the only type of
 1725          * file which can hold references to another file, so just mark
 1726          * other files, and defer unmarked sockets for the next pass.
 1727          */
 1728         if (fp->f_type == DTYPE_SOCKET) {
 1729                 unp_defer++;
 1730                 KASSERT(fp->f_count != 0);
 1731                 atomic_or_uint(&fp->f_flag, FDEFER);
 1732         } else {
 1733                 atomic_or_uint(&fp->f_flag, FMARK);
 1734         }
 1735         mutex_exit(&fp->f_lock);
 1736 }
 1737 
 1738 static void
 1739 unp_discard_now(file_t *fp)
 1740 {
 1741 
 1742         if (fp == NULL)
 1743                 return;
 1744 
 1745         KASSERT(fp->f_count > 0);
 1746         KASSERT(fp->f_msgcount > 0);
 1747 
 1748         mutex_enter(&fp->f_lock);
 1749         fp->f_msgcount--;
 1750         mutex_exit(&fp->f_lock);
 1751         atomic_dec_uint(&unp_rights);
 1752         (void)closef(fp);
 1753 }
 1754 
 1755 static void
 1756 unp_discard_later(file_t *fp)
 1757 {
 1758 
 1759         if (fp == NULL)
 1760                 return;
 1761 
 1762         KASSERT(fp->f_count > 0);
 1763         KASSERT(fp->f_msgcount > 0);
 1764 
 1765         mutex_enter(&filelist_lock);
 1766         if (fp->f_unpcount++ == 0) {
 1767                 SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
 1768         }
 1769         mutex_exit(&filelist_lock);
 1770 }

Cache object: afe82cea80fe9bf18ca94b617f64b990


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.