
FreeBSD/Linux Kernel Cross Reference
sys/bsd/kern/kern_aio.c


    1 /*
    2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_LICENSE_HEADER_START@
    5  * 
    6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
    7  * 
    8  * This file contains Original Code and/or Modifications of Original Code
    9  * as defined in and that are subject to the Apple Public Source License
   10  * Version 2.0 (the 'License'). You may not use this file except in
   11  * compliance with the License. Please obtain a copy of the License at
   12  * http://www.opensource.apple.com/apsl/ and read it before using this
   13  * file.
   14  * 
   15  * The Original Code and all software distributed under the License are
   16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   20  * Please see the License for the specific language governing rights and
   21  * limitations under the License.
   22  * 
   23  * @APPLE_LICENSE_HEADER_END@
   24  */
   25 
   26 
   27 /*
   28  * todo:
   29  *              1) ramesh is looking into how to replace taking a reference on
   30  *                      the user's map (vm_map_reference()) since it is believed
   31  *                      that it would not hold the process for us.
   32  *              2) david is looking into a way for us to set the priority of the
   33  *                      worker threads to match that of the user's thread when the 
   34  *                      async IO was queued.
   35  */
   36 
   37 
   38 /*
   39  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
   40  */
   41 
   42 #include <sys/systm.h>
   43 #include <sys/buf.h>
   44 #include <sys/fcntl.h>
   45 #include <sys/file.h>
   46 #include <sys/filedesc.h>
   47 #include <sys/kernel.h>
   48 #include <sys/vnode.h>
   49 #include <sys/malloc.h>
   50 #include <sys/mount.h>
   51 #include <sys/param.h>
   52 #include <sys/proc.h>
   53 #include <sys/sysctl.h>
   54 #include <sys/unistd.h>
   55 #include <sys/user.h>
   56 
   57 #include <sys/aio_kern.h>
   58 
   59 #include <machine/limits.h>
   60 #include <kern/zalloc.h>
   61 #include <kern/task.h>
   62 
   63 #include <sys/kdebug.h>
    64 #define AIO_work_queued                 1
    65 #define AIO_worker_wake                 2
    66 #define AIO_completion_sig              3
    67 #define AIO_completion_cleanup_wait     4
    68 #define AIO_completion_cleanup_wake     5
    69 #define AIO_completion_suspend_wake     6
    70 #define AIO_fsync_delay                 7
    71 #define AIO_cancel                      10
    72 #define AIO_cancel_async_workq          11
    73 #define AIO_cancel_sync_workq           12
    74 #define AIO_cancel_activeq              13
    75 #define AIO_cancel_doneq                14
    76 #define AIO_fsync                       20
    77 #define AIO_read                        30
    78 #define AIO_write                       40
    79 #define AIO_listio                      50
    80 #define AIO_error                       60
    81 #define AIO_error_val                   61
    82 #define AIO_error_activeq               62
    83 #define AIO_error_workq                 63
    84 #define AIO_return                      70
    85 #define AIO_return_val                  71
    86 #define AIO_return_activeq              72
    87 #define AIO_return_workq                73
    88 #define AIO_exec                        80
    89 #define AIO_exit                        90
    90 #define AIO_exit_sleep                  91
    91 #define AIO_close                       100
    92 #define AIO_close_sleep                 101
    93 #define AIO_suspend                     110
    94 #define AIO_suspend_sleep               111
    95 #define AIO_worker_thread               120
   96 
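/*
 * Illustrative sketch (not compiled): how the kdebug trace codes above are
 * used.  Every entry point in this file brackets its work with a pair of
 * KERNEL_DEBUG calls in exactly this pattern; see aio_cancel() below for a
 * live instance.
 */
#if 0
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
			  (int)p, (int)uap->aiocbp, 0, 0, 0 );
	/* ... routine body ... */
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
			  (int)p, (int)uap->aiocbp, result, 0, 0 );
#endif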
   97 #if 0
   98 #undef KERNEL_DEBUG
   99 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
  100 #endif
  101 
   102 /* 
   103  * aio requests queue up on the aio_async_workq or lio_sync_workq (for 
   104  * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq 
   105  * (proc.aio_activeq) when one of our worker threads starts the IO.  Finally, 
   106  * requests move to the per process aio_doneq (proc.aio_doneq) when the IO 
   107  * request completes.  A request remains on aio_doneq until the user process 
   108  * calls aio_return or the process exits; either event is our trigger to 
   109  * release aio resources.  (An illustrative userland sketch follows below.)
   110  */
  111 struct aio_anchor_cb
  112 {
   113         int                                     aio_async_workq_count;  /* entries on aio_async_workq */
   114         int                                     lio_sync_workq_count;   /* entries on lio_sync_workq */
   115         int                                     aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
   116         int                                     aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
   117         TAILQ_HEAD( , aio_workq_entry )         aio_async_workq;
   118         TAILQ_HEAD( , aio_workq_entry )         lio_sync_workq;
  119 };
  120 typedef struct aio_anchor_cb aio_anchor_cb;
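
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * the lifecycle above as seen from user space.  aio_read() puts the
 * request on the work queue, aio_error() reports EINPROGRESS while it
 * sits on the work or active queues, and aio_return() reaps it from the
 * done queue, releasing the kernel resources.  Error handling omitted.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <string.h>

static ssize_t
read_async( int fd, void *buf, size_t len )
{
	struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = 0;

	if ( aio_read( &cb ) != 0 )			/* queue the request */
		return( -1 );
	while ( aio_error( &cb ) == EINPROGRESS )
		;					/* request not done yet */
	return( aio_return( &cb ) );			/* reap and release */
}
#endif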
  121 
  122 
   123 /*
   124  * Notes on aio sleep / wake channels.
   125  * We currently pick a couple of fields within the proc structure to use as
   126  * sleep channels; these do not collide with any other kernel routines.
   127  * At this time, for binary compatibility reasons, we cannot add new proc fields.
   128  */
  129 #define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
  130 #define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu
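
/*
 * Illustrative sketch (not compiled): how the borrowed proc fields serve as
 * sleep / wake channels.  _aio_close() and _aio_exit() sleep like this; the
 * matching wakeup is assumed to come from the completion path once the last
 * active request for the process has finished (hypothetical pairing shown).
 */
#if 0
	tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
	/* ... and, on the completion side ... */
	wakeup( &p->AIO_CLEANUP_SLEEP_CHAN );
#endif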
  131 
  132 
  133 /*
   134  * async IO locking macros used to protect critical sections.
  135  */
  136 #define AIO_LOCK        usimple_lock( &aio_lock )
  137 #define AIO_UNLOCK      usimple_unlock( &aio_lock )
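
/*
 * Illustrative sketch (not compiled): the typical critical section formed
 * with these macros - the "quick check" idiom used by nearly every entry
 * point in this file.
 */
#if 0
	AIO_LOCK;
	count = aio_get_all_queues_count( );
	AIO_UNLOCK;
	if ( count < 1 )
		return;
#endif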
  138 
  139 
  140 /*
  141  *  LOCAL PROTOTYPES
  142  */
  143 static int                      aio_active_requests_for_process( struct proc *procp );
  144 static boolean_t        aio_delay_fsync_request( aio_workq_entry *entryp );
  145 static int                      aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
  146 static int                      aio_get_all_queues_count( void );
  147 static int                      aio_get_process_count( struct proc *procp );
  148 static aio_workq_entry *  aio_get_some_work( void );
  149 static boolean_t        aio_last_group_io( aio_workq_entry *entryp );
  150 static void                     aio_mark_requests( aio_workq_entry *entryp );
  151 static int                      aio_queue_async_request( struct proc *procp, 
  152                                                                                          struct aiocb *aiocbp,
  153                                                                                          int kindOfIO );
  154 static int                      aio_validate( aio_workq_entry *entryp );
  155 static void                     aio_work_thread( void );
  156 static int                      do_aio_cancel(  struct proc *p, 
  157                                                                         int fd, 
  158                                                                         struct aiocb *aiocbp, 
  159                                                                         boolean_t wait_for_completion,
  160                                                                         boolean_t disable_notification );
  161 static void                     do_aio_completion( aio_workq_entry *entryp );
  162 static int                      do_aio_fsync( aio_workq_entry *entryp );
  163 static int                      do_aio_read( aio_workq_entry *entryp );
  164 static int                      do_aio_write( aio_workq_entry *entryp );
  165 static boolean_t        is_already_queued(      struct proc *procp, 
  166                                                                                 struct aiocb *aiocbp );
  167 static int                      lio_create_async_entry( struct proc *procp, 
  168                                                                                          struct aiocb *aiocbp, 
  169                                                                                          struct sigevent *sigp, 
  170                                                                                          long group_tag,
  171                                                                                          aio_workq_entry **entrypp );
  172 static int                      lio_create_sync_entry( struct proc *procp, 
  173                                                                                         struct aiocb *aiocbp, 
  174                                                                                         long group_tag,
  175                                                                                         aio_workq_entry **entrypp );
  176 
  177 /*
  178  *  EXTERNAL PROTOTYPES
  179  */
  180 
  181 /* in ...bsd/kern/sys_generic.c */
  182 extern struct file*     holdfp( struct filedesc* fdp, int fd, int flag );
  183 extern int                      dofileread( struct proc *p, struct file *fp, int fd, 
  184                                                                 void *buf, size_t nbyte, off_t offset, 
  185                                                                 int flags, int *retval );
  186 extern int                      dofilewrite( struct proc *p, struct file *fp, int fd, 
  187                                                                  const void *buf, size_t nbyte, off_t offset, 
  188                                                                  int flags, int *retval );
  189 extern vm_map_t         vm_map_switch( vm_map_t    map );
  190 
  191 
  192 /*
  193  * aio external global variables.
  194  */
  195 extern int aio_max_requests;                            /* AIO_MAX - configurable */
  196 extern int aio_max_requests_per_process;        /* AIO_PROCESS_MAX - configurable */
  197 extern int aio_worker_threads;                          /* AIO_THREAD_COUNT - configurable */
  198 
  199 
  200 /*
  201  * aio static variables.
  202  */
  203 static aio_anchor_cb            aio_anchor;
  204 static simple_lock_data_t       aio_lock;
  205 static struct zone              *aio_workq_zonep;
  206 
  207 
  208 /*
  209  * syscall input parameters
  210  */
  211 #ifndef _SYS_SYSPROTO_H_
  212 
  213 struct  aio_cancel_args {
  214         int                             fd;     
  215         struct aiocb    *aiocbp;        
  216 };
  217 
  218 struct  aio_error_args {
  219         struct aiocb                    *aiocbp;        
  220 };
  221 
  222 struct  aio_fsync_args {
  223         int                                             op;     
  224         struct aiocb                    *aiocbp;        
  225 };
  226 
  227 struct  aio_read_args {
  228         struct aiocb                    *aiocbp;        
  229 };
  230 
  231 struct  aio_return_args {
  232         struct aiocb    *aiocbp;        
  233 };
  234 
  235 struct  aio_suspend_args {
  236         struct aiocb *const     *aiocblist;     
  237         int                                             nent;   
  238         const struct timespec   *timeoutp;      
  239 };
  240 
  241 struct  aio_write_args {
  242         struct aiocb                    *aiocbp;        
  243 };
  244 
  245 struct  lio_listio_args {
  246         int                                             mode;   
  247         struct aiocb *const     *aiocblist;     
  248         int                                             nent;   
  249         struct sigevent                 *sigp;  
  250 };
  251 
  252 #endif /* _SYS_SYSPROTO_H_ */
  253 
  254 
  255 /*
  256  * aio_cancel - attempt to cancel one or more async IO requests currently
  257  * outstanding against file descriptor uap->fd.  If uap->aiocbp is not 
  258  * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
   259  * is NULL then all outstanding async IO requests for the given file
  260  * descriptor are cancelled (if possible).
  261  */
  262 
  263 int
  264 aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
  265 {
  266         struct aiocb                            my_aiocb;
  267         int                                                     result;
  268         boolean_t                                       funnel_state;
  269 
  270         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
  271                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
  272 
  273         /* quick check to see if there are any async IO requests queued up */
  274         AIO_LOCK;
  275         result = aio_get_all_queues_count( );
  276         AIO_UNLOCK;
  277         if ( result < 1 ) {
  278                 result = EBADF;
  279                 goto ExitRoutine;
  280         }
  281         
  282         *retval = -1; 
  283         if ( uap->aiocbp != NULL ) {
  284                 result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
  285                 if ( result != 0 ) {
  286                         result = EAGAIN; 
  287                         goto ExitRoutine;
  288                 }
  289 
  290                 /* NOTE - POSIX standard says a mismatch between the file */
  291                 /* descriptor passed in and the file descriptor embedded in */
  292                 /* the aiocb causes unspecified results.  We return EBADF in */
  293                 /* that situation.  */
  294                 if ( uap->fd != my_aiocb.aio_fildes ) {
  295                         result = EBADF;
  296                         goto ExitRoutine;
  297                 }
  298         }
  299 
  300         /* current BSD code assumes funnel lock is held */
  301         funnel_state = thread_funnel_set( kernel_flock, TRUE );
  302         result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
  303         (void) thread_funnel_set( kernel_flock, funnel_state );
  304 
  305         if ( result != -1 ) {
  306                 *retval = result;
  307                 result = 0;
  308                 goto ExitRoutine;
  309         }
  310         
  311         result = EBADF;
  312         
  313 ExitRoutine:
  314         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
  315                           (int)p, (int)uap->aiocbp, result, 0, 0 );
  316 
  317         return( result );
  318 
  319 } /* aio_cancel */
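
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * cancelling everything outstanding on a descriptor and interpreting the
 * three POSIX result values the routine above can produce.
 */
#if 0
	switch ( aio_cancel( fd, NULL ) ) {
	case AIO_CANCELED:
		/* every matching request was cancelled */
		break;
	case AIO_NOTCANCELED:
		/* at least one request was already active; poll with aio_error() */
		break;
	case AIO_ALLDONE:
		/* all matching requests had already completed */
		break;
	default:
		/* -1: check errno (EBADF when nothing is queued up) */
		break;
	}
#endif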
  320 
  321 
  322 /*
  323  * _aio_close - internal function used to clean up async IO requests for 
  324  * a file descriptor that is closing.  
  325  * NOTE - kernel funnel lock is held when we get called. 
  326  * THIS MAY BLOCK.
  327  */
  328 
  329 __private_extern__ void
  330 _aio_close( struct proc *p, int fd )
  331 {
  332         int                     error, count;
  333 
  334         /* quick check to see if there are any async IO requests queued up */
  335         AIO_LOCK;
  336         count = aio_get_all_queues_count( );
  337         AIO_UNLOCK;
  338         if ( count < 1 )
  339                 return;
  340 
  341         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
  342                           (int)p, fd, 0, 0, 0 );
  343         
  344         /* cancel all async IO requests on our todo queues for this file descriptor */
  345         error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
  346         if ( error == AIO_NOTCANCELED ) {
  347                 /* 
  348                  * AIO_NOTCANCELED is returned when we find an aio request for this process 
  349                  * and file descriptor on the active async IO queue.  Active requests cannot 
  350                  * be cancelled so we must wait for them to complete.  We will get a special 
  351                  * wake up call on our channel used to sleep for ALL active requests to 
  352                  * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used  
  353                  * when we must wait for all active aio requests.  
  354                  */
  355 
  356                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
  357                                   (int)p, fd, 0, 0, 0 );
  358 
  359                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
  360         }
  361 
  362         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
  363                           (int)p, fd, 0, 0, 0 );
  364 
  365         return;
  366         
  367 } /* _aio_close */
  368 
  369 
  370 /*
  371  * aio_error - return the error status associated with the async IO
  372  * request referred to by uap->aiocbp.  The error status is the errno
   373  * value that would be set by the corresponding IO request (read, write,
   374  * fdatasync, or fsync).
  375  */
  376 
  377 int
  378 aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
  379 {
  380         aio_workq_entry                         *entryp;
  381         int                                                     error;
  382 
  383         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
  384                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
  385 
  386         AIO_LOCK;
  387 
  388         /* quick check to see if there are any async IO requests queued up */
  389         if ( aio_get_all_queues_count( ) < 1 ) {
  390                 error = EINVAL;
  391                 goto ExitRoutine;
  392         }
  393         
  394         /* look for a match on our queue of async IO requests that have completed */
  395         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
  396                 if ( entryp->uaiocbp == uap->aiocbp ) {
  397                         *retval = entryp->errorval;
  398                         error = 0;
  399                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
  400                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  401                         goto ExitRoutine;
  402                 }
  403         }
  404         
  405         /* look for a match on our queue of active async IO requests */
  406         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
  407                 if ( entryp->uaiocbp == uap->aiocbp ) {
  408                         *retval = EINPROGRESS;
  409                         error = 0;
  410                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
  411                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  412                         goto ExitRoutine;
  413                 }
  414         }
  415         
  416         /* look for a match on our queue of todo work */
  417         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
  418                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
  419                         *retval = EINPROGRESS;
  420                         error = 0;
  421                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
  422                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  423                         goto ExitRoutine;
  424                 }
  425         }
  426         error = EINVAL;
  427         
  428 ExitRoutine:
  429         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
  430                           (int)p, (int)uap->aiocbp, error, 0, 0 );
  431         AIO_UNLOCK;
  432 
  433         return( error );
  434 
  435 } /* aio_error */
  436 
  437 
  438 /*
  439  * aio_fsync - asynchronously force all IO operations associated 
  440  * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and 
  441  * queued at the time of the call to the synchronized completion state.
  442  * NOTE - we do not support op O_DSYNC at this point since we do not support the 
  443  * fdatasync() call.
  444  */
  445 
  446 int
  447 aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
  448 {
  449         int                     error;
  450         int                     fsync_kind;
  451 
  452         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
  453                           (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
  454 
  455         *retval = 0;
  456         if ( uap->op == O_SYNC )
  457                 fsync_kind = AIO_FSYNC;
  458 #if 0 // we don't support fdatasync() call yet
  459         else if ( uap->op == O_DSYNC )
  460                 fsync_kind = AIO_DSYNC;
  461 #endif
  462         else {
  463                 *retval = -1;
  464                 error = EINVAL;
  465                 goto ExitRoutine;
  466         }
  467         
  468         error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
  469         if ( error != 0 )
  470                 *retval = -1;
  471 
  472 ExitRoutine:            
  473         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
  474                           (int)p, (int)uap->aiocbp, error, 0, 0 );
  475 
  476         return( error );
  477 
  478 } /* aio_fsync */
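
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * queueing an asynchronous fsync.  Only O_SYNC is accepted by the routine
 * above; O_DSYNC is rejected with EINVAL since fdatasync() is unsupported.
 */
#if 0
	struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	if ( aio_fsync( O_SYNC, &cb ) != 0 ) {
		/* EINVAL for a bad op, EAGAIN if the request could not be queued */
	}
#endif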
  479 
  480 
  481 /* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the 
  482  * file descriptor (uap->aiocbp->aio_fildes) into the buffer 
  483  * (uap->aiocbp->aio_buf).
  484  */
  485 
  486 int
  487 aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
  488 {
  489         int                     error;
  490 
  491         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
  492                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
  493         
  494         *retval = 0;
  495 
  496         error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
  497         if ( error != 0 )
  498                 *retval = -1;
  499 
  500         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
  501                           (int)p, (int)uap->aiocbp, error, 0, 0 );
  502                 
  503         return( error );
  504 
  505 } /* aio_read */
  506 
  507 
  508 /*
  509  * aio_return - return the return status associated with the async IO
  510  * request referred to by uap->aiocbp.  The return status is the value
   511  * that would be returned by the corresponding IO request (read, write,
   512  * fdatasync, or fsync).  This is where we release the kernel resources 
   513  * held for the async IO call associated with the given aiocb pointer.
  514  */
  515 
  516 int
  517 aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
  518 {
  519         aio_workq_entry                         *entryp;
  520         int                                                     error;
  521         boolean_t                                       lock_held;
  522         
  523         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
  524                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
  525 
  526         AIO_LOCK;
  527         lock_held = TRUE;
  528         *retval = 0;
  529         
  530         /* quick check to see if there are any async IO requests queued up */
  531         if ( aio_get_all_queues_count( ) < 1 ) {
  532                 error = EINVAL;
  533                 goto ExitRoutine;
  534         }
  535 
  536         /* look for a match on our queue of async IO requests that have completed */
  537         TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
  538                 if ( entryp->uaiocbp == uap->aiocbp ) {
  539                         TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
  540                         aio_anchor.aio_done_count--;
  541                         p->aio_done_count--;
  542                         
  543                         *retval = entryp->returnval;
  544 
  545                         /* we cannot free requests that are still completing */
  546                         if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
  547                                 vm_map_t                my_map;
  548                         
  549                                 my_map = entryp->aio_map;
  550                                 entryp->aio_map = VM_MAP_NULL;
  551                                 AIO_UNLOCK;
  552                                 lock_held = FALSE;
  553                                 aio_free_request( entryp, my_map );
  554                         }
  555                         else
  556                                 /* tell completion code to free this request */
  557                                 entryp->flags |= AIO_DO_FREE;
  558                         error = 0;
  559                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
  560                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  561                         goto ExitRoutine;
  562                 }
  563         }
  564         
  565         /* look for a match on our queue of active async IO requests */
  566         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
  567                 if ( entryp->uaiocbp == uap->aiocbp ) {
  568                         error = EINPROGRESS;
  569                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
  570                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  571                         goto ExitRoutine;
  572                 }
  573         }
  574         
  575         /* look for a match on our queue of todo work */
  576         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
  577                 if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
  578                         error = EINPROGRESS;
  579                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
  580                                            (int)p, (int)uap->aiocbp, *retval, 0, 0 );
  581                         goto ExitRoutine;
  582                 }
  583         }
  584         error = EINVAL;
  585         
  586 ExitRoutine:
  587         if ( lock_held )
  588                 AIO_UNLOCK;
  589         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
  590                           (int)p, (int)uap->aiocbp, error, 0, 0 );
  591 
  592         return( error );
  593 
  594 } /* aio_return */
  595 
  596 
  597 /*
  598  * _aio_exec - internal function used to clean up async IO requests for 
  599  * a process that is going away due to exec().  We cancel any async IOs   
  600  * we can and wait for those already active.  We also disable signaling
  601  * for cancelled or active aio requests that complete. 
  602  * NOTE - kernel funnel lock is held when we get called. 
  603  * This routine MAY block!
  604  */
  605 
  606 __private_extern__ void
  607 _aio_exec( struct proc *p )
  608 {
  609 
  610         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
  611                           (int)p, 0, 0, 0, 0 );
  612 
  613         _aio_exit( p );
  614 
  615         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
  616                           (int)p, 0, 0, 0, 0 );
  617 
  618         return;
  619                 
  620 } /* _aio_exec */
  621 
  622 
  623 /*
  624  * _aio_exit - internal function used to clean up async IO requests for 
  625  * a process that is terminating (via exit() or exec() ).  We cancel any async IOs   
  626  * we can and wait for those already active.  We also disable signaling
  627  * for cancelled or active aio requests that complete.  This routine MAY block!
  628  * NOTE - kernel funnel lock is held when we get called. 
  629  */
  630 
  631 __private_extern__ void
  632 _aio_exit( struct proc *p )
  633 {
  634         int                                             error, count;
  635         aio_workq_entry                 *entryp;
  636 
  637         /* quick check to see if there are any async IO requests queued up */
  638         AIO_LOCK;
  639         count = aio_get_all_queues_count( );
  640         AIO_UNLOCK;
  641         if ( count < 1 ) {
  642                 return;
  643         }
  644 
  645         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
  646                           (int)p, 0, 0, 0, 0 );
  647 
  648         /* 
  649          * cancel async IO requests on the todo work queue and wait for those  
  650          * already active to complete. 
  651          */
  652         error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
  653         if ( error == AIO_NOTCANCELED ) {
  654                 /* 
  655                  * AIO_NOTCANCELED is returned when we find an aio request for this process 
  656                  * on the active async IO queue.  Active requests cannot be cancelled so we 
  657                  * must wait for them to complete.  We will get a special wake up call on 
  658                  * our channel used to sleep for ALL active requests to complete.  This sleep 
  659                  * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all 
  660                  * active aio requests.  
  661                  */
  662 
  663                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
  664                                   (int)p, 0, 0, 0, 0 );
  665 
  666                 tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
  667         }
  668         
  669         /* release all aio resources used by this process */
  670         AIO_LOCK;
  671         entryp = TAILQ_FIRST( &p->aio_doneq );
  672         while ( entryp != NULL ) {
  673                 aio_workq_entry                 *next_entryp;
  674                         
  675                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
  676                 TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
  677                 aio_anchor.aio_done_count--;
  678                 p->aio_done_count--;
  679                         
  680                 /* we cannot free requests that are still completing */
  681                 if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
  682                         vm_map_t                my_map;
  683                         
  684                         my_map = entryp->aio_map;
  685                         entryp->aio_map = VM_MAP_NULL;
  686                         AIO_UNLOCK;
  687                         aio_free_request( entryp, my_map );
  688 
  689                         /* need to start over since aio_doneq may have been */
  690                         /* changed while we were away.  */
  691                         AIO_LOCK;
  692                         entryp = TAILQ_FIRST( &p->aio_doneq );
  693                         continue;
  694                 }
  695                 else
  696                         /* tell completion code to free this request */
  697                         entryp->flags |= AIO_DO_FREE;
  698                 entryp = next_entryp;
  699         }
  700         AIO_UNLOCK;
  701 
  702 ExitRoutine:
  703         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
  704                           (int)p, 0, 0, 0, 0 );
  705 
  706         return;
  707         
  708 } /* _aio_exit */
  709 
  710 
  711 /*
  712  * do_aio_cancel - cancel async IO requests (if possible).  We get called by
  713  * aio_cancel, close, and at exit.  
   714  * There are three modes of operation: 1) cancel all async IOs for a process - 
   715  * fd is 0 and aiocbp is NULL; 2) cancel all async IOs for a file descriptor - 
   716  * fd is > 0 and aiocbp is NULL; 3) cancel the one async IO associated with the 
   717  * given aiocbp.
   718  * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all 
   719  * target async IO requests, AIO_NOTCANCELED if we could not cancel all 
   720  * target async IO requests, and AIO_ALLDONE if all target async IO requests 
   721  * were already complete.
   722  * WARNING - do not dereference aiocbp in this routine; it may point to user 
   723  * land data that has not been copied in (when called from aio_cancel() ).
  724  * NOTE - kernel funnel lock is held when we get called. 
  725  */
  726 
  727 static int
  728 do_aio_cancel(  struct proc *p, int fd, struct aiocb *aiocbp, 
  729                                 boolean_t wait_for_completion, boolean_t disable_notification )
  730 {
  731         aio_workq_entry                 *entryp;
  732         int                                             result;
  733 
  734         result = -1;
  735                 
  736         /* look for a match on our queue of async todo work. */
  737         AIO_LOCK;
  738         entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
  739         while ( entryp != NULL ) {
  740                 aio_workq_entry                 *next_entryp;
  741                 
  742                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
  743                 if ( p == entryp->procp ) {
  744                         if ( (aiocbp == NULL && fd == 0) ||
  745                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
  746                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
  747                                 /* we found a match so we remove the entry from the */
  748                                 /* todo work queue and place it on the done queue */
  749                                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
  750                                 aio_anchor.aio_async_workq_count--;
  751                                 entryp->errorval = ECANCELED;
  752                                 entryp->returnval = -1;
  753                                 if ( disable_notification )
  754                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
  755                                 result = AIO_CANCELED;
  756 
  757                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
  758                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
  759 
  760                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
  761                                 aio_anchor.aio_done_count++;
  762                                 p->aio_done_count++;
  763                                 entryp->flags |= AIO_COMPLETION;
  764                                 AIO_UNLOCK;
  765                                 
  766                                 /* do completion processing for this request */
  767                                 do_aio_completion( entryp );
  768                         
  769                                 AIO_LOCK;
  770                                 entryp->flags &= ~AIO_COMPLETION;
  771                                 if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
  772                                         vm_map_t                my_map;
  773                                         
  774                                         my_map = entryp->aio_map;
  775                                         entryp->aio_map = VM_MAP_NULL;
  776                                         AIO_UNLOCK;
  777                                         aio_free_request( entryp, my_map );
  778                                 }
  779                                 else
  780                                         AIO_UNLOCK;
  781 
  782                                 if ( aiocbp != NULL ) {
  783                                         return( result );
  784                                 }
  785                                 
  786                                 /* need to start over since aio_async_workq may have been */
  787                                 /* changed while we were away doing completion processing.  */
  788                                 AIO_LOCK;
  789                                 entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
  790                                 continue;
  791                         }
  792                 }
  793                 entryp = next_entryp;
  794         } /* while... */
  795                 
  796         /* 
  797          * look for a match on our queue of synchronous todo work.  This will 
  798          * be a rare occurrence but could happen if a process is terminated while 
  799          * processing a lio_listio call. 
  800          */
  801         entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
  802         while ( entryp != NULL ) {
  803                 aio_workq_entry                 *next_entryp;
  804                 
  805                 next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
  806                 if ( p == entryp->procp ) {
  807                         if ( (aiocbp == NULL && fd == 0) ||
  808                                  (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
  809                                  (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
  810                                 /* we found a match so we remove the entry from the */
  811                                 /* todo work queue and place it on the done queue */
  812                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
  813                                 aio_anchor.lio_sync_workq_count--;
  814                                 entryp->errorval = ECANCELED;
  815                                 entryp->returnval = -1;
  816                                 if ( disable_notification )
  817                                         entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
  818                                 result = AIO_CANCELED;
  819 
  820                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
  821                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
  822 
  823                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
  824                                 aio_anchor.aio_done_count++;
  825                                 p->aio_done_count++;
  826                                 if ( aiocbp != NULL ) {
  827                                         AIO_UNLOCK;
  828                                         return( result );
  829                                 }
  830                         }
  831                 }
  832                 entryp = next_entryp;
  833         } /* while... */
  834 
  835         /* 
  836          * look for a match on our queue of active async IO requests and 
  837          * return AIO_NOTCANCELED result. 
  838          */
  839         TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
  840                 if ( (aiocbp == NULL && fd == 0) ||
  841                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
  842                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
  843                         result = AIO_NOTCANCELED;
  844 
  845                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
  846                                                   (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
  847 
  848                         if ( wait_for_completion )
  849                                 entryp->flags |= AIO_WAITING; /* flag for special completion processing */
  850                         if ( disable_notification )
  851                                 entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
  852                         if ( aiocbp != NULL ) {
  853                                 AIO_UNLOCK;
  854                                 return( result );
  855                         }
  856                 }
  857         }
  858         
  859         /* 
  860          * if we didn't find any matches on the todo or active queues then look for a 
  861          * match on our queue of async IO requests that have completed and if found 
  862          * return AIO_ALLDONE result.  
  863          */
  864         if ( result == -1 ) {
  865                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
  866                 if ( (aiocbp == NULL && fd == 0) ||
  867                          (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
  868                          (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
  869                                 result = AIO_ALLDONE;
  870 
  871                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
  872                                                           (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
  873 
  874                                 if ( aiocbp != NULL ) {
  875                                         AIO_UNLOCK;
  876                                         return( result );
  877                                 }
  878                         }
  879                 }
  880         }
  881         AIO_UNLOCK;
  882 
  883         return( result );
  884         
  885 } /* do_aio_cancel */
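
/*
 * Illustrative sketch (not compiled): the three ways the rest of this file
 * invokes do_aio_cancel(), one per mode described above.
 */
#if 0
	/* _aio_exit():  cancel everything for the process, wait, no notification */
	error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );

	/* _aio_close(): cancel everything for one fd, wait for active requests */
	error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );

	/* aio_cancel(): cancel one request (or all on a fd), do not wait */
	result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
#endif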
  886 
  887 
  888 /*
  889  * aio_suspend - suspend the calling thread until at least one of the async
  890  * IO operations referenced by uap->aiocblist has completed, until a signal
   891  * interrupts the function, or the optional uap->timeoutp time interval has
   892  * passed.
   893  * Returns 0 if one or more async IOs have completed, else -1 with errno
   894  * set appropriately - EAGAIN if the timeout elapses or EINTR if a signal
   895  * woke us up.
  896  */
  897 
  898 int
  899 aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
  900 {
  901         int                                     error;
  902         int                                     i, count;
  903         uint64_t                        abstime;
  904         struct timespec         ts;
  905         struct timeval          tv;
  906         aio_workq_entry         *entryp;
  907         struct aiocb *          *aiocbpp;
  908         
  909         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
  910                           (int)p, uap->nent, 0, 0, 0 );
  911 
  912         *retval = -1;
  913         abstime = 0;
  914         aiocbpp = NULL;
  915 
  916         /* quick check to see if there are any async IO requests queued up */
  917         AIO_LOCK;
  918         count = aio_get_all_queues_count( );
  919         AIO_UNLOCK;
  920         if ( count < 1 ) {
  921                 error = EINVAL;
  922                 goto ExitThisRoutine;
  923         }
  924 
  925         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
  926                 error = EINVAL;
  927                 goto ExitThisRoutine;
  928         }
  929 
  930         if ( uap->timeoutp != NULL ) {
  931                 error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
  932                 if ( error != 0 ) {
  933                         error = EAGAIN;
  934                         goto ExitThisRoutine;
  935                 }
  936                         
  937                 if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
  938                         error = EINVAL;
  939                         goto ExitThisRoutine;
  940                 }
  941 
  942                 nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, 
  943                                                                          &abstime );
  944                 clock_absolutetime_interval_to_deadline( abstime, &abstime );
  945         }
  946 
  947         MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
  948         if ( aiocbpp == NULL ) {
  949                 error = EAGAIN;
  950                 goto ExitThisRoutine;
  951         }
  952 
  953         /* check list of aio requests to see if any have completed */
  954         for ( i = 0; i < uap->nent; i++ ) {
  955                 struct aiocb    *aiocbp;
  956         
   957                 /* copy in the aiocb pointer from the list */
  958                 error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
  959                 if ( error != 0 ) {
  960                         error = EAGAIN;
  961                         goto ExitThisRoutine;
  962                 }
  963         
  964                 /* NULL elements are legal so check for 'em */
  965                 aiocbp = *(aiocbpp + i);
  966                 if ( aiocbp == NULL )
  967                         continue;
  968 
  969                 /* return immediately if any aio request in the list is done */
  970                 AIO_LOCK;
  971                 TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
  972                         if ( entryp->uaiocbp == aiocbp ) {
  973                                 *retval = 0;
  974                                 error = 0;
  975                                 AIO_UNLOCK;
  976                                 goto ExitThisRoutine;
  977                         }
  978                 }
  979                 AIO_UNLOCK;
  980         } /* for ( ; i < uap->nent; ) */
  981 
  982         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
  983                           (int)p, uap->nent, 0, 0, 0 );
  984         
   985         /* 
   986          * wait for an async IO to complete, a signal to fire, or the timeout to 
   987          * expire.  we return EAGAIN (35) for timeout expiration and EINTR (4) when 
   988          * a signal interrupts us.  If an async IO completes before a signal fires 
   989          * or our timeout expires, we get a wakeup call from aio_work_thread().  We 
   990          * do not use tsleep() here in order to avoid taking the kernel funnel lock.
   991          */
  992         assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
  993         if ( abstime > 0 ) {
  994                 thread_set_timer_deadline( abstime );
  995         }
  996         error = thread_block( THREAD_CONTINUE_NULL );
  997         if ( error == THREAD_AWAKENED ) {
  998                 /* got our wakeup call from aio_work_thread() */
  999                 if ( abstime > 0 ) {
 1000                         thread_cancel_timer();
 1001                 }
 1002                 *retval = 0;
 1003                 error = 0;
 1004         }
 1005         else if ( error == THREAD_TIMED_OUT ) {
 1006                 /* our timeout expired */
 1007                 error = EAGAIN;
 1008         }
 1009         else {
 1010                 /* we were interrupted */
 1011                 if ( abstime > 0 ) {
 1012                         thread_cancel_timer();
 1013                 }
 1014                 error = EINTR;
 1015         }
 1016 
 1017 ExitThisRoutine:
 1018         if ( aiocbpp != NULL )
 1019                 FREE( aiocbpp, M_TEMP );
 1020 
 1021         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
 1022                           (int)p, uap->nent, error, 0, 0 );
 1023         
 1024         return( error );        
 1025 
 1026 } /* aio_suspend */
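
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * waiting up to one second for either of two queued requests to finish,
 * distinguishing the EAGAIN (timeout) and EINTR (signal) cases described
 * above.
 */
#if 0
	const struct aiocb	*list[2] = { &cb1, &cb2 };
	struct timespec		ts = { 1, 0 };	/* 1 second, 0 nanoseconds */

	if ( aio_suspend( list, 2, &ts ) != 0 ) {
		if ( errno == EAGAIN ) {
			/* the timeout expired before any request completed */
		}
		else if ( errno == EINTR ) {
			/* a signal interrupted the wait */
		}
	}
#endif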
 1027 
 1028 
 1029 /* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the 
 1030  * file descriptor (uap->aiocbp->aio_fildes) from the buffer 
 1031  * (uap->aiocbp->aio_buf).
 1032  */
 1033 
 1034 int
 1035 aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
 1036 {
 1037         int                     error;
 1038         
 1039         *retval = 0;
 1040         
 1041         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
 1042                           (int)p, (int)uap->aiocbp, 0, 0, 0 );
 1043 
 1044         error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
 1045         if ( error != 0 )
 1046                 *retval = -1;
 1047 
 1048         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
 1049                           (int)p, (int)uap->aiocbp, error, 0, 0 );
 1050                 
 1051         return( error );
 1052 
 1053 } /* aio_write */
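
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * queueing an asynchronous write at an explicit offset.
 */
#if 0
	struct aiocb	cb;

	memset( &cb, 0, sizeof(cb) );
	cb.aio_fildes = fd;
	cb.aio_buf = data;
	cb.aio_nbytes = data_len;
	cb.aio_offset = file_offset;	/* position for the write */

	if ( aio_write( &cb ) != 0 ) {
		/* EAGAIN: the request could not be queued */
	}
#endif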
 1054 
 1055 
 1056 /*
 1057  * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 1058  * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 1059  * The caller gets error and return status for each aiocb in the list via aio_error 
 1060  * and aio_return.  We must keep completed requests until released by the 
 1061  * aio_return call.
 1062  */
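
/*
 * Illustrative userland sketch (not part of this file, not compiled):
 * issuing a read and a write as one batch and waiting for both, per the
 * LIO_WAIT path implemented below.  Per-request status remains available
 * through aio_error() / aio_return() afterwards.
 */
#if 0
	struct aiocb	rd_cb, wr_cb;
	struct aiocb	*list[2] = { &rd_cb, &wr_cb };

	/* ... fill in aio_fildes / aio_buf / aio_nbytes / aio_offset ... */
	rd_cb.aio_lio_opcode = LIO_READ;
	wr_cb.aio_lio_opcode = LIO_WRITE;

	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 ) {
		/* EAGAIN / EINVAL / EIO - see the error handling below */
	}
#endif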
 1063 
 1064 int
 1065 lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
 1066 {
 1067         int                                                     i;
 1068         int                                                     call_result;
 1069         int                                                     result;
 1070         long                                            group_tag;
 1071         aio_workq_entry *                       *entryp_listp;
 1072 
 1073         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
 1074                           (int)p, uap->nent, uap->mode, 0, 0 );
 1075         
 1076         entryp_listp = NULL;
 1077         call_result = -1;
 1078         *retval = -1;
 1079         if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
 1080                 call_result = EINVAL;
 1081                 goto ExitRoutine;
 1082         }
 1083 
 1084         if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
 1085                 call_result = EINVAL;
 1086                 goto ExitRoutine;
 1087         }
 1088         
  1089         /* 
  1090          * we use group_tag to mark IO requests for delayed completion processing, 
  1091          * meaning we wait until all IO requests in the group have completed 
  1092          * before we either return to the caller (when mode is LIO_WAIT) or 
  1093          * signal the user (when mode is LIO_NOWAIT). 
  1094          */
 1095         group_tag = random();
 1096                 
 1097         /* 
 1098          * allocate a list of aio_workq_entry pointers that we will use to queue
 1099          * up all our requests at once while holding our lock.
 1100          */
  1101         MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
 1102         if ( entryp_listp == NULL ) {
 1103                 call_result = EAGAIN;
 1104                 goto ExitRoutine;
 1105         }
 1106 
 1107         /* process list of aio requests */
 1108         for ( i = 0; i < uap->nent; i++ ) {
 1109                 struct aiocb    *my_aiocbp;
 1110         
 1111                 *(entryp_listp + i) = NULL;
 1112                 
  1113                 /* copy in the aiocb pointer from the list */
 1114                 result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
 1115                 if ( result != 0 ) {
 1116                         call_result = EAGAIN;
 1117                         continue;
 1118                 }
 1119         
 1120                 /* NULL elements are legal so check for 'em */
 1121                 if ( my_aiocbp == NULL )
 1122                         continue;
 1123 
 1124                 if ( uap->mode == LIO_NOWAIT )
 1125                         result = lio_create_async_entry( p, my_aiocbp, uap->sigp, 
 1126                                                                                          group_tag, (entryp_listp + i) );
 1127                 else
 1128                         result = lio_create_sync_entry( p, my_aiocbp, group_tag, 
 1129                                                                                         (entryp_listp + i) );
 1130 
 1131                 if ( result != 0 && call_result == -1 )
 1132                         call_result = result;
 1133         }
 1134 
 1135         /* 
 1136          * we need to protect this section since we do not want any of these grouped 
 1137          * IO requests to begin until we have them all on the queue.
 1138          */
 1139         AIO_LOCK;
 1140         for ( i = 0; i < uap->nent; i++ ) {
 1141                 aio_workq_entry                         *entryp;
 1142                 
 1143                 /* NULL elements are legal so check for 'em */
 1144                 entryp = *(entryp_listp + i);
 1145                 if ( entryp == NULL )
 1146                         continue;
 1147 
 1148                 /* check our aio limits to throttle bad or rude user land behavior */
 1149                 if ( aio_get_all_queues_count( ) >= aio_max_requests || 
 1150                          aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
 1151                          is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
 1152                         vm_map_t                my_map;
 1153                         
 1154                         my_map = entryp->aio_map;
 1155                         entryp->aio_map = VM_MAP_NULL;
  1156                         if ( call_result == -1 ) call_result = EAGAIN;  /* report dropped request */
 1157                         AIO_UNLOCK;
 1158                         aio_free_request( entryp, my_map );
 1159                         AIO_LOCK;
 1160                         continue;
 1161                 }
 1162                 
 1163                 /* place the request on the appropriate queue */
 1164                 if ( uap->mode == LIO_NOWAIT ) {
 1165                         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 1166                         aio_anchor.aio_async_workq_count++;
 1167 
 1168                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
 1169                                           (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
 1170                 }
 1171                 else {
 1172                         TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
 1173                         aio_anchor.lio_sync_workq_count++;
 1174                 }
 1175         }
 1176         AIO_UNLOCK;
 1177 
 1178         if ( uap->mode == LIO_NOWAIT ) 
 1179                 /* caller does not want to wait so we'll fire off a worker thread and return */
 1180                 wakeup_one( &aio_anchor.aio_async_workq );
 1181         else {
 1182                 aio_workq_entry                 *entryp;
 1183                 int                                     error;
 1184 
 1185                 /* 
 1186                  * mode is LIO_WAIT - handle the IO requests now.
 1187                  */
 1188                 AIO_LOCK;
 1189                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
 1190                 while ( entryp != NULL ) {
 1191                         if ( p == entryp->procp && group_tag == entryp->group_tag ) {
 1192                                 boolean_t       funnel_state;
 1193                                         
 1194                                 TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
 1195                                 aio_anchor.lio_sync_workq_count--;
 1196                                 AIO_UNLOCK;
 1197                                 
 1198                                 // file system IO code path requires kernel funnel lock
 1199                                 funnel_state = thread_funnel_set( kernel_flock, TRUE );
 1200                                 if ( (entryp->flags & AIO_READ) != 0 ) {
 1201                                         error = do_aio_read( entryp );
 1202                                 }
 1203                                 else if ( (entryp->flags & AIO_WRITE) != 0 ) {
 1204                                         error = do_aio_write( entryp );
 1205                                 }
 1206                                 else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
 1207                                         error = do_aio_fsync( entryp );
 1208                                 }
 1209                                 else {
 1210                                         printf( "%s - unknown aio request - flags 0x%02X \n", 
 1211                                                         __FUNCTION__, entryp->flags );
 1212                                         error = EINVAL;
 1213                                 }
 1214                                 entryp->errorval = error;       
 1215                                 if ( error != 0 && call_result == -1 )
 1216                                         call_result = EIO;
 1217                                 (void) thread_funnel_set( kernel_flock, funnel_state );
 1218 
 1219                                 AIO_LOCK;
 1220                                 /* we're done with the IO request so move it onto the done queue */
 1221                                 TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
 1222                                 aio_anchor.aio_done_count++;
 1223                                 p->aio_done_count++;
 1224 
 1225                                 /* need to start over since lio_sync_workq may have been changed while we */
 1226                                 /* were away doing the IO.  */
 1227                                 entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
 1228                                 continue;
 1229                         } /* p == entryp->procp */
 1230                         
 1231                         entryp = TAILQ_NEXT( entryp, aio_workq_link );
 1232                 } /* while ( entryp != NULL ) */
 1233                 AIO_UNLOCK;
 1234         } /* uap->mode == LIO_WAIT */
 1235 
 1236         /* call_result == -1 means we had no trouble queueing up requests */
 1237         if ( call_result == -1 ) {
 1238                 call_result = 0;
 1239                 *retval = 0;
 1240         }
 1241 
 1242 ExitRoutine:            
 1243         if ( entryp_listp != NULL )
 1244                 FREE( entryp_listp, M_TEMP );
 1245 
 1246         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
 1247                           (int)p, call_result, 0, 0, 0 );
 1248         
 1249         return( call_result );
 1250         
 1251 } /* lio_listio */
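
      /*
       * Example (hypothetical, user land): a minimal sketch of driving the
       * lio_listio() path above, assuming the POSIX <aio.h> interface.  The
       * file path and buffer sizes are made up.  LIO_WAIT exercises the
       * lio_sync_workq branch above; LIO_NOWAIT would queue both requests on
       * aio_async_workq and return immediately.
       */
      #include <aio.h>
      #include <fcntl.h>
      #include <stdio.h>
      #include <string.h>
      #include <unistd.h>

      int
      read_two_blocks( const char *path )
      {
              static char             buf1[512], buf2[512];
              struct aiocb            cb1, cb2;
              struct aiocb            *list[2] = { &cb1, &cb2 };
              int                     fd;

              fd = open( path, O_RDONLY );
              if ( fd < 0 )
                      return( -1 );

              memset( &cb1, 0, sizeof(cb1) );
              cb1.aio_fildes = fd;
              cb1.aio_buf = buf1;
              cb1.aio_nbytes = sizeof(buf1);
              cb1.aio_offset = 0;
              cb1.aio_lio_opcode = LIO_READ;

              cb2 = cb1;                      /* same fd and opcode */
              cb2.aio_buf = buf2;
              cb2.aio_offset = sizeof(buf1);  /* second block */

              /* returns only after both reads have been serviced */
              if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
                      perror( "lio_listio" );

              close( fd );
              return( 0 );
      }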
 1252 
 1253 
 1254 /*
 1255  * aio worker thread.  this is where all the real work gets done.
 1256  * we get a wake up call on sleep channel &aio_anchor.aio_async_workq 
 1257  * after new work is queued up.
 1258  */
 1259 
 1260 static void
 1261 aio_work_thread( void )
 1262 {
 1263         aio_workq_entry                 *entryp;
 1264         struct uthread                  *uthread = (struct uthread *)get_bsdthread_info(current_act());
 1265         
 1266         for( ;; ) {
 1267                 entryp = aio_get_some_work();
 1268                 if ( entryp == NULL ) {
 1269                         /* 
 1270                          * aio worker threads wait for some work to get queued up 
 1271                          * by aio_queue_async_request.  Once some work gets queued,
 1272                          * it will wake up one of these worker threads just before 
 1273                          * returning to our caller in user land.  We do not use
 1274                          * tsleep() here in order to avoid taking the kernel funnel lock.
 1275                          */
 1276                         assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
 1277                         thread_block( THREAD_CONTINUE_NULL );
 1278                         
 1279                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
 1280                                                   0, 0, 0, 0, 0 );
 1281                 }
 1282                 else {
 1283                         int                     error;
 1284                         boolean_t               funnel_state;
 1285                         vm_map_t                currentmap;
 1286                         vm_map_t                oldmap = VM_MAP_NULL;
 1287                         task_t                  oldaiotask = TASK_NULL;
 1288 
 1289                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
 1290                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
 1291                         
 1292                         /*
 1293                          * Assume the target's address space identity for the duration
 1294                          * of the IO.
 1295                          */
 1296                         funnel_state = thread_funnel_set( kernel_flock, TRUE );
 1297                         
 1298                         currentmap = get_task_map( (current_proc())->task );
 1299                         if ( currentmap != entryp->aio_map ) {
 1300                                 oldaiotask = uthread->uu_aio_task;
 1301                                 uthread->uu_aio_task = entryp->procp->task;
 1302                                 oldmap = vm_map_switch( entryp->aio_map );
 1303                         }
 1304                         
 1305                         if ( (entryp->flags & AIO_READ) != 0 ) {
 1306                                 error = do_aio_read( entryp );
 1307                         }
 1308                         else if ( (entryp->flags & AIO_WRITE) != 0 ) {
 1309                                 error = do_aio_write( entryp );
 1310                         }
 1311                         else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
 1312                                 error = do_aio_fsync( entryp );
 1313                         }
 1314                         else {
 1315                                 printf( "%s - unknown aio request - flags 0x%02X \n", 
 1316                                                 __FUNCTION__, entryp->flags );
 1317                                 error = EINVAL;
 1318                         }
 1319                         entryp->errorval = error;               
 1320                         if ( currentmap != entryp->aio_map ) {
 1321                                 (void) vm_map_switch( oldmap );
 1322                                 uthread->uu_aio_task = oldaiotask;
 1323                         }
 1324                                 
 1325                         /* we're done with the IO request so pop it off the active queue and */
 1326                         /* push it on the done queue */
 1327                         AIO_LOCK;
 1328                         TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
 1329                         aio_anchor.aio_active_count--;
 1330                         entryp->procp->aio_active_count--;
 1331                         TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
 1332                         aio_anchor.aio_done_count++;
 1333                         entryp->procp->aio_done_count++;
 1334                         entryp->flags |= AIO_COMPLETION;
 1335 
 1336                         /* remove our reference to the user land map. */
 1337                         if ( VM_MAP_NULL != entryp->aio_map ) {
 1338                                 vm_map_t                my_map;
 1339                                 
 1340                                 my_map = entryp->aio_map;
 1341                                 entryp->aio_map = VM_MAP_NULL;
 1342                                 AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
 1343                                 vm_map_deallocate( my_map );
 1344                         }
 1345                         else {
 1346                                 AIO_UNLOCK;
 1347                         }
 1348                         
 1349                         do_aio_completion( entryp );
 1350                         (void) thread_funnel_set( kernel_flock, funnel_state );
 1351                         
 1352                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
 1353                                                   (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval, 
 1354                                                   entryp->returnval, 0 );
 1355                         
 1356                         AIO_LOCK;
 1357                         entryp->flags &= ~AIO_COMPLETION;
 1358                         if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
 1359                                 vm_map_t                my_map;
 1360                         
 1361                                 my_map = entryp->aio_map;
 1362                                 entryp->aio_map = VM_MAP_NULL;
 1363                                 AIO_UNLOCK;
 1364                                 aio_free_request( entryp, my_map );
 1365                         }
 1366                         else
 1367                                 AIO_UNLOCK;
 1368                 }
 1369         } /* for ( ;; ) */
 1370 
 1371         /* NOT REACHED */
 1372         
 1373 } /* aio_work_thread */
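
      /*
       * The assert_wait()/thread_block() pairing above, together with the
       * wakeup_one() calls in aio_queue_async_request and lio_listio, forms a
       * classic blocking consumer loop.  A rough user-space analogue using a
       * pthread condition variable (purely illustrative; struct work and
       * service() are made-up names).  A producer would insert under q_lock
       * and call pthread_cond_signal(), playing the role of wakeup_one().
       */
      #include <pthread.h>
      #include <stddef.h>

      struct work {
              struct work             *next;
      };

      static pthread_mutex_t          q_lock = PTHREAD_MUTEX_INITIALIZER;
      static pthread_cond_t           q_cond = PTHREAD_COND_INITIALIZER;
      static struct work              *q_head = NULL;

      extern void service( struct work *wp ); /* do_aio_read/write/fsync analogue */

      static void *
      worker( void *arg )
      {
              struct work             *wp;

              (void) arg;
              for ( ;; ) {
                      pthread_mutex_lock( &q_lock );
                      /* no work queued: block, like thread_block() above */
                      while ( q_head == NULL )
                              pthread_cond_wait( &q_cond, &q_lock );
                      wp = q_head;            /* pop, like aio_get_some_work() */
                      q_head = wp->next;
                      pthread_mutex_unlock( &q_lock );
                      service( wp );
              }
              return( NULL );
      }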
 1374 
 1375 
 1376 /*
 1377  * aio_get_some_work - get the next async IO request that is ready to be executed.
 1378  * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 1379  * IO requests that were queued at the time the aio_fsync call came in have completed.
 1380  */
 1381 
 1382 static aio_workq_entry *
 1383 aio_get_some_work( void )
 1384 {
 1385         aio_workq_entry                         *entryp;
 1386         int                                                     skip_count = 0;
 1387         
 1388         /* pop some work off the work queue and add to our active queue */
 1389         AIO_LOCK;
 1390         for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
 1391                   entryp != NULL;
 1392                   entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {
 1393 
 1394                 if ( (entryp->flags & AIO_FSYNC) != 0 ) {
 1395                         /* leave aio_fsync calls on the work queue if there are IO */
 1396                         /* requests on the active queue for the same file descriptor. */
 1397                         if ( aio_delay_fsync_request( entryp ) ) {
 1398 
 1399                                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
 1400                                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
 1401                                 continue;
 1402                         }
 1403                 }
 1404                 break;
 1405         }
 1406         
 1407         if ( entryp != NULL ) {
 1408                 TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 1409                 aio_anchor.aio_async_workq_count--;
 1410                 TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
 1411                 aio_anchor.aio_active_count++;
 1412                 entryp->procp->aio_active_count++;
 1413         }
 1414         AIO_UNLOCK;
 1415                 
 1416         return( entryp );
 1417         
 1418 } /* aio_get_some_work */
 1419 
 1420 
 1421 /*
 1422  * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
 1423  * this time.  Delay will happen when there are any active IOs for the same file 
 1424  * descriptor that were queued at the time the aio_fsync call was queued.  
 1425  * NOTE - AIO_LOCK must be held by caller
 1426  */
 1427 static boolean_t
 1428 aio_delay_fsync_request( aio_workq_entry *entryp )
 1429 {
 1430         aio_workq_entry                 *my_entryp;
 1431 
 1432         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
 1433                 if ( my_entryp->fsyncp != NULL &&
 1434                          entryp->uaiocbp == my_entryp->fsyncp &&
 1435                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
 1436                         return( TRUE );
 1437                 }
 1438         }
 1439                 
 1440         return( FALSE );
 1441         
 1442 } /* aio_delay_fsync_request */
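
      /*
       * Example (hypothetical, user land): the ordering that aio_delay_fsync_request
       * enforces.  If the write below is still active when the fsync comes up
       * for service, the fsync is left on the work queue until the write
       * completes.  A sketch assuming the POSIX <aio.h> interface; fd and buf
       * come from the caller.
       */
      #include <aio.h>
      #include <fcntl.h>
      #include <stddef.h>
      #include <string.h>

      int
      queue_write_then_fsync( int fd, char *buf, size_t len,
                              struct aiocb *w, struct aiocb *s )
      {
              memset( w, 0, sizeof(*w) );
              w->aio_fildes = fd;
              w->aio_buf = buf;
              w->aio_nbytes = len;
              w->aio_offset = 0;
              if ( aio_write( w ) != 0 )
                      return( -1 );

              memset( s, 0, sizeof(*s) );
              s->aio_fildes = fd;
              /* held back until the write above (same fd, queued earlier) is done */
              return( aio_fsync( O_SYNC, s ) );
      }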
 1443 
 1444 
 1445 /*
 1446  * aio_queue_async_request - queue up an async IO request on our work queue then
 1447  * wake up one of our worker threads to do the actual work.  We get a reference
 1448  * to our caller's user land map in order to keep it around while we are
 1449  * processing the request. 
 1450  */
 1451 
 1452 static int
 1453 aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
 1454 {
 1455         aio_workq_entry                 *entryp;
 1456         int                                             result;
 1457 
 1458         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
 1459         if ( entryp == NULL ) {
 1460                 result = EAGAIN; 
 1461                 goto error_exit;
 1462         }
 1463         bzero( entryp, sizeof(*entryp) );
 1464 
 1465         /* fill in the rest of the aio_workq_entry */
 1466         entryp->procp = procp;
 1467         entryp->uaiocbp = aiocbp;
 1468         entryp->flags |= kindOfIO;
 1469         entryp->aio_map = VM_MAP_NULL;
 1470         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
 1471         if ( result != 0 ) {
 1472                 result = EAGAIN;
 1473                 goto error_exit;
 1474         }
 1475 
 1476         /* do some more validation on the aiocb and embedded file descriptor */
 1477         result = aio_validate( entryp );
 1478         if ( result != 0 ) 
 1479                 goto error_exit;
 1480 
 1481         /* get a reference to the user land map in order to keep it around */
 1482         entryp->aio_map = get_task_map( procp->task );
 1483         vm_map_reference( entryp->aio_map );
 1484 
 1485         AIO_LOCK;
 1486 
 1487         if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
 1488                 AIO_UNLOCK;
 1489                 result = EAGAIN; 
 1490                 goto error_exit;
 1491         }
 1492 
 1493         /* check our aio limits to throttle bad or rude user land behavior */
 1494         if ( aio_get_all_queues_count( ) >= aio_max_requests || 
 1495                  aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
 1496                 AIO_UNLOCK;
 1497                 result = EAGAIN; 
 1498                 goto error_exit;
 1499         }
 1500         
 1501         /* 
 1502          * aio_fsync calls sync up all async IO requests queued at the time 
 1503          * the aio_fsync call was made.  So we mark each currently queued async 
 1504          * IO with a matching file descriptor as one that must complete before 
 1505          * we do the fsync.  We set the fsyncp field of each matching async IO 
 1506          * request with the aiocb pointer passed in on the aio_fsync call to 
 1507          * know which IOs must complete before we process the aio_fsync call. 
 1508          */
 1509         if ( (kindOfIO & AIO_FSYNC) != 0 )
 1510                 aio_mark_requests( entryp );
 1511         
 1512         /* queue up on our aio asynchronous work queue */
 1513         TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
 1514         aio_anchor.aio_async_workq_count++;
 1515         
 1516         AIO_UNLOCK;
 1517 
 1518         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
 1519                           (int)procp, (int)aiocbp, 0, 0, 0 );
 1520 
 1521         wakeup_one( &aio_anchor.aio_async_workq );
 1522 
 1523         return( 0 );
 1524         
 1525 error_exit:
 1526         if ( entryp != NULL ) {
 1527                 /* this entry has not been queued up so no worries about unlocked */
 1528                 /* state and aio_map */
 1529                 aio_free_request( entryp, entryp->aio_map );
 1530         }
 1531                 
 1532         return( result );
 1533         
 1534 } /* aio_queue_async_request */
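
      /*
       * Example (hypothetical, user land): queueing one async read through the
       * path above, assuming the POSIX <aio.h> interface.  Note that EAGAIN
       * from aio_read() can mean the aio_max_requests or
       * aio_max_requests_per_process throttle was hit, or that the same aiocb
       * is already queued.
       */
      #include <aio.h>
      #include <stddef.h>
      #include <stdio.h>
      #include <string.h>

      int
      start_read( int fd, char *buf, size_t len, struct aiocb *cb )
      {
              memset( cb, 0, sizeof(*cb) );
              cb->aio_fildes = fd;
              cb->aio_buf = buf;
              cb->aio_nbytes = len;
              cb->aio_offset = 0;
              if ( aio_read( cb ) != 0 ) {
                      perror( "aio_read" );   /* often EAGAIN when throttled */
                      return( -1 );
              }
              return( 0 );
      }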
 1535 
 1536 
 1537 /*
 1538  * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
 1539  * If all goes well return 0 and pass the aio_workq_entry pointer back to
 1540  * our caller.  We get a reference to our caller's user land map in order to keep 
 1541  * it around while we are processing the request.  
 1542  * lio_listio calls behave differently at completion they do completion notification 
 1543  * when all async IO requests have completed.  We use group_tag to tag IO requests 
 1544  * that behave in the delay notification manner. 
 1545  */
 1546 
 1547 static int
 1548 lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp, 
 1549                                                  struct sigevent *sigp, long group_tag,
 1550                                                  aio_workq_entry **entrypp )
 1551 {
 1552         aio_workq_entry                         *entryp;
 1553         int                                                     result;
 1554 
 1555         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
 1556         if ( entryp == NULL ) {
 1557                 result = EAGAIN; 
 1558                 goto error_exit;
 1559         }
 1560         bzero( entryp, sizeof(*entryp) );
 1561 
 1562         /* fill in the rest of the aio_workq_entry */
 1563         entryp->procp = procp;
 1564         entryp->uaiocbp = aiocbp;
 1565         entryp->flags |= AIO_LIO;
 1566         entryp->group_tag = group_tag;
 1567         entryp->aio_map = VM_MAP_NULL;
 1568         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
 1569         if ( result != 0 ) {
 1570                 result = EAGAIN;
 1571                 goto error_exit;
 1572         }
 1573 
 1574         /* look for lio_listio LIO_NOP requests and ignore them. */
 1575         /* Not really an error, but we need to free our aio_workq_entry.  */
 1576         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
 1577                 result = 0;
 1578                 goto error_exit;
 1579         }
 1580 
 1581         /* use sigevent passed in to lio_listio for each of our calls, but only */
 1582         /* do completion notification after the last request completes. */
 1583         if ( sigp != NULL ) {
 1584                 result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
 1585                 if ( result != 0 ) {
 1586                         result = EAGAIN;
 1587                         goto error_exit;
 1588                 }
 1589         }
 1590 
 1591         /* do some more validation on the aiocb and embedded file descriptor */
 1592         result = aio_validate( entryp );
 1593         if ( result != 0 ) 
 1594                 goto error_exit;
 1595 
 1596         /* get a reference to the user land map in order to keep it around */
 1597         entryp->aio_map = get_task_map( procp->task );
 1598         vm_map_reference( entryp->aio_map );
 1599         
 1600         *entrypp = entryp;
 1601         return( 0 );
 1602         
 1603 error_exit:
 1604         if ( entryp != NULL )
 1605                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
 1606                 
 1607         return( result );
 1608         
 1609 } /* lio_create_async_entry */
 1610 
 1611 
 1612 /*
 1613  * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
 1614  * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
 1615  * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
 1616  * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
 1617  * NOTE - AIO_LOCK must be held by caller
 1618  */
 1619 
 1620 static void
 1621 aio_mark_requests( aio_workq_entry *entryp )
 1622 {
 1623         aio_workq_entry                 *my_entryp;
 1624 
 1625         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
 1626                 if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
 1627                         my_entryp->fsyncp = entryp->uaiocbp;
 1628                 }
 1629         }
 1630         
 1631         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 1632                 if ( entryp->procp == my_entryp->procp &&
 1633                          entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
 1634                         my_entryp->fsyncp = entryp->uaiocbp;
 1635                 }
 1636         }
 1637                                 
 1638 } /* aio_mark_requests */
 1639 
 1640 
 1641 /*
 1642  * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
 1643  * If all goes well return 0 and pass the aio_workq_entry pointer back to
 1644  * our caller.  
 1645  * lio_listio calls behave differently at completion: they do completion notification 
 1646  * only when all async IO requests in the group have completed.  We use group_tag to 
 1647  * tag IO requests that behave in this delayed-notification manner. 
 1648  */
 1649 
 1650 static int
 1651 lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp, 
 1652                                                 long group_tag, aio_workq_entry **entrypp )
 1653 {
 1654         aio_workq_entry                         *entryp;
 1655         int                                                     result;
 1656 
 1657         entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
 1658         if ( entryp == NULL ) {
 1659                 result = EAGAIN; 
 1660                 goto error_exit;
 1661         }
 1662         bzero( entryp, sizeof(*entryp) );
 1663 
 1664         /* fill in the rest of the aio_workq_entry */
 1665         entryp->procp = procp;
 1666         entryp->uaiocbp = aiocbp;
 1667         entryp->flags |= AIO_LIO;
 1668         entryp->group_tag = group_tag;
 1669         entryp->aio_map = VM_MAP_NULL;
 1670         result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
 1671         if ( result != 0 ) {
 1672                 result = EAGAIN;
 1673                 goto error_exit;
 1674         }
 1675 
 1676         /* look for lio_listio LIO_NOP requests and ignore them. */
 1677         /* Not really an error, but we need to free our aio_workq_entry.  */
 1678         if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
 1679                 result = 0;
 1680                 goto error_exit;
 1681         }
 1682 
 1683         result = aio_validate( entryp );
 1684         if ( result != 0 ) {
 1685                 goto error_exit;
 1686         }
 1687 
 1688         *entrypp = entryp;
 1689         return( 0 );
 1690         
 1691 error_exit:
 1692         if ( entryp != NULL )
 1693                 zfree( aio_workq_zonep, (vm_offset_t) entryp );
 1694                 
 1695         return( result );
 1696         
 1697 } /* lio_create_sync_entry */
 1698 
 1699 
 1700 /*
 1701  * aio_free_request - remove our reference on the user land map and
 1702  * free the work queue entry resources.
 1703  * We are not holding the lock here; thus aio_map is passed in, having
 1704  * been zeroed while we did hold the lock.
 1705  */
 1706 
 1707 static int
 1708 aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
 1709 {
 1710         /* remove our reference to the user land map. */
 1711         if ( VM_MAP_NULL != the_map ) {
 1712                 vm_map_deallocate( the_map );
 1713         }
 1714                 
 1715         zfree( aio_workq_zonep, (vm_offset_t) entryp );
 1716 
 1717         return( 0 );
 1718         
 1719 } /* aio_free_request */
 1720 
 1721 
 1722 /* aio_validate - validate the aiocb passed in by one of the aio syscalls.
 1723  */
 1724 
 1725 static int
 1726 aio_validate( aio_workq_entry *entryp ) 
 1727 {
 1728         boolean_t                                       funnel_state;
 1729         struct file                             *fp;
 1730         int                                                     flag;
 1731         int                                                     result;
 1732         
 1733         result = 0;
 1734 
 1735         if ( (entryp->flags & AIO_LIO) != 0 ) {
 1736                 if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
 1737                         entryp->flags |= AIO_READ;
 1738                 else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
 1739                         entryp->flags |= AIO_WRITE;
 1740                 else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
 1741                         return( 0 );
 1742                 else
 1743                         return( EINVAL );
 1744         }
 1745 
 1746         flag = FREAD;
 1747         if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
 1748                 flag = FWRITE;
 1749         }
 1750 
 1751         if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
 1752                 if ( entryp->aiocb.aio_offset < 0                       ||
 1753                          entryp->aiocb.aio_nbytes < 0                   ||
 1754                          entryp->aiocb.aio_nbytes > INT_MAX     ||
 1755                          entryp->aiocb.aio_buf == NULL )
 1756                         return( EINVAL );
 1757         }
 1758 
 1759         /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
 1760          * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value, 
 1761          * sigev_notify_function, and sigev_notify_attributes are ignored.
 1762          */
 1763         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
 1764                 int             signum;
 1765                 /* make sure we have a valid signal number */
 1766                 signum = entryp->aiocb.aio_sigevent.sigev_signo;
 1767                 if ( signum <= 0 || signum >= NSIG || 
 1768                          signum == SIGKILL || signum == SIGSTOP )
 1769                         return (EINVAL);
 1770         }
 1771         else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
 1772                 return (EINVAL);
 1773         
 1774         /* validate the file descriptor and that the file was opened
 1775          * for the appropriate read / write access.  This section requires 
 1776          * the kernel funnel lock.
 1777          */
 1778         funnel_state = thread_funnel_set( kernel_flock, TRUE );
 1779 
 1780         result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
 1781         if ( result == 0 ) {
 1782                 if ( (fp->f_flag & flag) == 0 ) {
 1783                         /* we don't have read or write access */
 1784                         result = EBADF;
 1785                 }
 1786                 else if ( fp->f_type != DTYPE_VNODE ) {
 1787                         /* this is not a file */
 1788                         result = ESPIPE;
 1789                 }
 1790         }
 1791         else {
 1792                 result = EBADF;
 1793         }
 1794         
 1795         (void) thread_funnel_set( kernel_flock, funnel_state );
 1796 
 1797         return( result );
 1798 
 1799 } /* aio_validate */
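
      /*
       * Example (hypothetical, user land): an aiocb that satisfies the checks
       * in aio_validate above.  The descriptor must refer to a vnode opened
       * with the matching access mode; SIGUSR1 is just an arbitrary catchable
       * signal.
       */
      #include <aio.h>
      #include <signal.h>
      #include <string.h>

      void
      fill_valid_aiocb( struct aiocb *cb, int fd, char *buf, size_t len )
      {
              memset( cb, 0, sizeof(*cb) );
              cb->aio_fildes = fd;            /* an open file, not a pipe or socket */
              cb->aio_buf = buf;              /* NULL is rejected with EINVAL */
              cb->aio_nbytes = len;           /* must not exceed INT_MAX */
              cb->aio_offset = 0;             /* negative offsets are rejected */
              cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
              cb->aio_sigevent.sigev_signo = SIGUSR1; /* SIGKILL and SIGSTOP are rejected */
      }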
 1800 
 1801 
 1802 /*
 1803  * aio_get_process_count - runs through our queues that hold outstanding 
 1804  * async IO requests and totals up the number of requests for the given
 1805  * process. 
 1806  * NOTE - caller must hold aio lock! 
 1807  */
 1808 
 1809 static int
 1810 aio_get_process_count( struct proc *procp ) 
 1811 {
 1812         aio_workq_entry                         *entryp;
 1813         int                                                     error;
 1814         int                                                     count;
 1815         
 1816         /* begin with count of completed async IO requests for this process */
 1817         count = procp->aio_done_count;
 1818         
 1819         /* add in count of active async IO requests for this process */
 1820         count += procp->aio_active_count;
 1821         
 1822         /* look for matches on our queue of asynchronous todo work */
 1823         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 1824                 if ( procp == entryp->procp ) {
 1825                         count++;
 1826                 }
 1827         }
 1828         
 1829         /* look for matches on our queue of synchronous todo work */
 1830         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
 1831                 if ( procp == entryp->procp ) {
 1832                         count++;
 1833                 }
 1834         }
 1835         
 1836         return( count );
 1837         
 1838 } /* aio_get_process_count */
 1839 
 1840 
 1841 /*
 1842  * aio_get_all_queues_count - get total number of entries on all aio work queues.  
 1843  * NOTE - caller must hold aio lock! 
 1844  */
 1845 
 1846 static int
 1847 aio_get_all_queues_count( void ) 
 1848 {
 1849         int                                                     count;
 1850         
 1851         count = aio_anchor.aio_async_workq_count;
 1852         count += aio_anchor.lio_sync_workq_count;
 1853         count += aio_anchor.aio_active_count;
 1854         count += aio_anchor.aio_done_count;
 1855                 
 1856         return( count );
 1857         
 1858 } /* aio_get_all_queues_count */
 1859 
 1860 
 1861 /*
 1862  * do_aio_completion.  Handle async IO completion.  
 1863  */
 1864 
 1865 static void
 1866 do_aio_completion( aio_workq_entry *entryp ) 
 1867 {
 1868         /* signal user land process if appropriate */
 1869         if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
 1870                  (entryp->flags & AIO_DISABLE) == 0 ) {
 1871 
 1872                 /* 
 1873                  * if group_tag is non-zero then make sure this is the last IO request
 1874                  * in the group before we signal.
 1875                  */
 1876                 if ( entryp->group_tag == 0 || 
 1877                          (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
 1878                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
 1879                                                   (int)entryp->procp, (int)entryp->uaiocbp, 
 1880                                                   entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
 1881                         
 1882                         psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
 1883                         return;
 1884                 }
 1885         }
 1886 
 1887         /*
 1888          * need to handle the case where a process is trying to exit, exec, or close
 1889          * and is currently waiting for active aio requests to complete.  If  
 1890          * AIO_WAITING is set then we need to look to see if there are any 
 1891          * other requests in the active queue for this process.  If there are 
 1892          * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If 
 1893          * there are some still active then do nothing - we only want to wakeup 
 1894          * when all active aio requests for the process are complete. 
 1895          */
 1896         if ( (entryp->flags & AIO_WAITING) != 0 ) {
 1897                 int             active_requests;
 1898 
 1899                 KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
 1900                                           (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
 1901                 
 1902                 AIO_LOCK;
 1903                 active_requests = aio_active_requests_for_process( entryp->procp );
 1904                 AIO_UNLOCK;
 1905                 if ( active_requests < 1 ) {
 1906                         /* no active aio requests for this process, continue exiting */
 1907 
 1908                         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
 1909                                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
 1910                 
 1911                         wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );
 1912                 }
 1913                 return;
 1914         }
 1915 
 1916         /* 
 1917          * aio_suspend case when a signal was not requested.  In that scenario we  
 1918          * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.   
 1919          * NOTE - the assumption here is that this wakeup call is inexpensive.
 1920          * we really only need to do this when an aio_suspend call is pending.
 1921          * If we find the wakeup call should be avoided we could mark the 
 1922          * async IO requests given in the list provided by aio_suspend and only
 1923          * call wakeup for them.  If we do mark them we should unmark them after
 1924          * the aio_suspend wakes up.
 1925          */
 1926         KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
 1927                                   (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
 1928                 
 1929         wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN ); 
 1930         
 1931         return;
 1932         
 1933 } /* do_aio_completion */
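
      /*
       * Example (hypothetical, user land): the aio_suspend() side of the
       * wakeup issued above on AIO_SUSPEND_SLEEP_CHAN.  A sketch assuming the
       * POSIX <aio.h> interface; cb must have been queued earlier with
       * aio_read() or aio_write().
       */
      #include <aio.h>
      #include <errno.h>
      #include <stddef.h>

      int
      wait_for_one( struct aiocb *cb )
      {
              const struct aiocb      *list[1];

              list[0] = cb;
              /* returns once the request completes and this channel is woken */
              while ( aio_suspend( list, 1, NULL ) != 0 ) {
                      if ( errno != EINTR )
                              return( -1 );
              }
              return( 0 );
      }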
 1934 
 1935 
 1936 /*
 1937  * aio_last_group_io - checks to see if this is the last unfinished IO request
 1938  * for the given group_tag.  Returns TRUE if there are no other active IO 
 1939  * requests for this group or FALSE if there are active IO requests. 
 1940  * NOTE - AIO_LOCK must be held by caller
 1941  */
 1942 
 1943 static boolean_t
 1944 aio_last_group_io( aio_workq_entry *entryp ) 
 1945 {
 1946         aio_workq_entry                         *my_entryp;
 1947                         
 1948         /* look for matches on our queue of active async IO requests */
 1949         TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
 1950                 if ( my_entryp->group_tag == entryp->group_tag )
 1951                         return( FALSE );
 1952         }
 1953         
 1954         /* look for matches on our queue of asynchronous todo work */
 1955         TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 1956                 if ( my_entryp->group_tag == entryp->group_tag )
 1957                         return( FALSE );
 1958         }
 1959         
 1960         /* look for matches on our queue of synchronous todo work */
 1961         TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
 1962                 if ( my_entryp->group_tag == entryp->group_tag )
 1963                         return( FALSE );
 1964         }
 1965 
 1966         return( TRUE );
 1967         
 1968 } /* aio_last_group_io */
 1969 
 1970 
 1971 /*
 1972  * do_aio_read
 1973  */
 1974 static int
 1975 do_aio_read( aio_workq_entry *entryp )
 1976 {
 1977         struct file                     *fp;
 1978         int                                             error;
 1979 
 1980         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
 1981         if ( fp != NULL ) {
 1982                 error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes, 
 1983                                                         (void *)entryp->aiocb.aio_buf, 
 1984                                                         entryp->aiocb.aio_nbytes,
 1985                                                         entryp->aiocb.aio_offset, FOF_OFFSET, 
 1986                                                         &entryp->returnval );
 1987                 frele( fp );
 1988         }
 1989         else
 1990                 error = EBADF;
 1991                         
 1992         return( error );
 1993         
 1994 } /* do_aio_read */
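
      /*
       * Example (hypothetical, user land): reaping a completed request.  The
       * errorval and returnval fields set by the worker back the aio_error()
       * and aio_return() calls respectively.  A sketch assuming the POSIX
       * <aio.h> interface.
       */
      #include <aio.h>
      #include <errno.h>
      #include <sys/types.h>

      ssize_t
      reap_if_done( struct aiocb *cb )
      {
              int             err;

              err = aio_error( cb );          /* EINPROGRESS while still queued or active */
              if ( err == EINPROGRESS )
                      return( -1 );
              if ( err != 0 ) {
                      errno = err;            /* entryp->errorval */
                      return( -1 );
              }
              return( aio_return( cb ) );     /* entryp->returnval, e.g. bytes read */
      }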
 1995 
 1996 
 1997 /*
 1998  * do_aio_write
 1999  */
 2000 static int
 2001 do_aio_write( aio_workq_entry *entryp )
 2002 {
 2003         struct file                     *fp;
 2004         int                                             error;
 2005 
 2006         fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
 2007         if ( fp != NULL ) {
 2008                 error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes, 
 2009                                                          (const void *)entryp->aiocb.aio_buf, 
 2010                                                          entryp->aiocb.aio_nbytes,
 2011                                                          entryp->aiocb.aio_offset, FOF_OFFSET, 
 2012                                                          &entryp->returnval );
 2013                 frele( fp );
 2014         }
 2015         else
 2016                 error = EBADF;
 2017 
 2018         return( error );
 2019 
 2020 } /* do_aio_write */
 2021 
 2022 
 2023 /*
 2024  * aio_active_requests_for_process - return number of active async IO
 2025  * requests for the given process.
 2026  * NOTE - caller must hold aio lock!
 2027  */
 2028 
 2029 static int
 2030 aio_active_requests_for_process( struct proc *procp )
 2031 {
 2032                                 
 2033         return( procp->aio_active_count );
 2034 
 2035 } /* aio_active_requests_for_process */
 2036 
 2037 
 2038 /*
 2039  * do_aio_fsync
 2040  */
 2041 static int
 2042 do_aio_fsync( aio_workq_entry *entryp )
 2043 {
 2044         register struct vnode   *vp;
 2045         struct file                     *fp;
 2046         int                                             error;
 2047         
 2048         /* 
 2049          * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.  
 2050          * AIO_DSYNC is caught before we queue up a request and flagged as an error.  
 2051          * The following was shamelessly extracted from the fsync() implementation. 
 2052          */
 2053         error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
 2054         if ( error == 0 ) {
 2055                 vp = (struct vnode *)fp->f_data;
 2056                 vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
 2057                 error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
 2058                 VOP_UNLOCK( vp, 0, entryp->procp );
 2059         }
 2060         if ( error != 0 )
 2061                 entryp->returnval = -1;
 2062 
 2063         return( error );
 2064                 
 2065 } /* do_aio_fsync */
 2066 
 2067 
 2068 /*
 2069  * is_already_queued - runs through our queues to see if the given  
 2070  * aiocbp / process is there.  Returns TRUE if there is a match
 2071  * on any of our aio queues.
 2072  * NOTE - callers must hold aio lock!
 2073  */
 2074 
 2075 static boolean_t
 2076 is_already_queued(      struct proc *procp, 
 2077                                         struct aiocb *aiocbp ) 
 2078 {
 2079         aio_workq_entry                 *entryp;
 2080         boolean_t                               result;
 2081                 
 2082         result = FALSE;
 2083                 
 2084         /* look for matches on our queue of async IO requests that have completed */
 2085         TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
 2086                 if ( aiocbp == entryp->uaiocbp ) {
 2087                         result = TRUE;
 2088                         goto ExitThisRoutine;
 2089                 }
 2090         }
 2091         
 2092         /* look for matches on our queue of active async IO requests */
 2093         TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
 2094                 if ( aiocbp == entryp->uaiocbp ) {
 2095                         result = TRUE;
 2096                         goto ExitThisRoutine;
 2097                 }
 2098         }
 2099         
 2100         /* look for matches on our queue of asynchronous todo work */
 2101         TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
 2102                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
 2103                         result = TRUE;
 2104                         goto ExitThisRoutine;
 2105                 }
 2106         }
 2107         
 2108         /* look for matches on our queue of synchronous todo work */
 2109         TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
 2110                 if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
 2111                         result = TRUE;
 2112                         goto ExitThisRoutine;
 2113                 }
 2114         }
 2115 
 2116 ExitThisRoutine:
 2117         return( result );
 2118         
 2119 } /* is_already_queued */
 2120 
 2121 
 2122 /*
 2123  * aio initialization
 2124  */
 2125 __private_extern__ void
 2126 aio_init( void )
 2127 {
 2128         int                     i;
 2129         
 2130         simple_lock_init( &aio_lock );
 2131 
 2132         AIO_LOCK;
 2133         TAILQ_INIT( &aio_anchor.aio_async_workq );      
 2134         TAILQ_INIT( &aio_anchor.lio_sync_workq );       
 2135         aio_anchor.aio_async_workq_count = 0;
 2136         aio_anchor.lio_sync_workq_count = 0;
 2137         aio_anchor.aio_active_count = 0;
 2138         aio_anchor.aio_done_count = 0;
 2139         AIO_UNLOCK;
 2140 
 2141         i = sizeof( aio_workq_entry );
 2142         aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
 2143                 
 2144         _aio_create_worker_threads( aio_worker_threads );
 2145 
 2146         return;
 2147         
 2148 } /* aio_init */
 2149 
 2150 
 2151 /*
 2152  * aio worker threads created here.
 2153  */
 2154 __private_extern__ void
 2155 _aio_create_worker_threads( int num )
 2156 {
 2157         int                     i;
 2158         
 2159         /* create some worker threads to handle the async IO requests */
 2160         for ( i = 0; i < num; i++ ) {
 2161                 thread_t                myThread;
 2162                 
 2163                 myThread = kernel_thread( kernel_task, aio_work_thread );
 2164                 if ( THREAD_NULL == myThread ) {
 2165                         printf( "%s - failed to create a work thread \n", __FUNCTION__ ); 
 2166                 }
 2167         }
 2168         
 2169         return;
 2170         
 2171 } /* _aio_create_worker_threads */
 2172 
 2173 /*
 2174  * Return the aio task for the current thread activation
 2175  */
 2176 task_t
 2177 get_aiotask(void)
 2178 {
 2179         return  ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;  
 2180 }
