The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_aio.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions
    6  * are met:
    7  * 1. Redistributions of source code must retain the above copyright
    8  *    notice, this list of conditions and the following disclaimer.
    9  * 2. John S. Dyson's name may not be used to endorse or promote products
   10  *    derived from this software without specific prior written permission.
   11  *
   12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
   13  * bad that happens because of using this software isn't the responsibility
   14  * of the author.  This software is distributed AS-IS.
   15  *
   16  * $FreeBSD$
   17  */
   18 
   19 /*
   20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
   21  */
   22 
   23 #include <sys/param.h>
   24 #include <sys/systm.h>
   25 #include <sys/buf.h>
   26 #include <sys/sysproto.h>
   27 #include <sys/filedesc.h>
   28 #include <sys/kernel.h>
   29 #include <sys/fcntl.h>
   30 #include <sys/file.h>
   31 #include <sys/lock.h>
   32 #include <sys/unistd.h>
   33 #include <sys/proc.h>
   34 #include <sys/resourcevar.h>
   35 #include <sys/signalvar.h>
   36 #include <sys/protosw.h>
   37 #include <sys/socketvar.h>
   38 #include <sys/sysctl.h>
   39 #include <sys/vnode.h>
   40 #include <sys/conf.h>
   41 #include <sys/event.h>
   42 
   43 #include <vm/vm.h>
   44 #include <vm/vm_extern.h>
   45 #include <vm/pmap.h>
   46 #include <vm/vm_map.h>
   47 #include <vm/vm_zone.h>
   48 #include <sys/aio.h>
   49 
   50 #include <machine/limits.h>
   51 #include "opt_vfs_aio.h"
   52 
   53 #ifdef VFS_AIO
   54 
   55 /*
   56  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
   57  * overflow.
   58  */
   59 static  long jobrefid;
   60 
   61 #define JOBST_NULL              0x0
   62 #define JOBST_JOBQGLOBAL        0x2
   63 #define JOBST_JOBRUNNING        0x3
   64 #define JOBST_JOBFINISHED       0x4
   65 #define JOBST_JOBQBUF           0x5
   66 #define JOBST_JOBBFINISHED      0x6
   67 
   68 #ifndef MAX_AIO_PER_PROC
   69 #define MAX_AIO_PER_PROC        32
   70 #endif
   71 
   72 #ifndef MAX_AIO_QUEUE_PER_PROC
   73 #define MAX_AIO_QUEUE_PER_PROC  256 /* Bigger than AIO_LISTIO_MAX */
   74 #endif
   75 
   76 #ifndef MAX_AIO_PROCS
   77 #define MAX_AIO_PROCS           32
   78 #endif
   79 
   80 #ifndef MAX_AIO_QUEUE
   81 #define MAX_AIO_QUEUE           1024 /* Bigger than AIO_LISTIO_MAX */
   82 #endif
   83 
   84 #ifndef TARGET_AIO_PROCS
   85 #define TARGET_AIO_PROCS        4
   86 #endif
   87 
   88 #ifndef MAX_BUF_AIO
   89 #define MAX_BUF_AIO             16
   90 #endif
   91 
   92 #ifndef AIOD_TIMEOUT_DEFAULT
   93 #define AIOD_TIMEOUT_DEFAULT    (10 * hz)
   94 #endif
   95 
   96 #ifndef AIOD_LIFETIME_DEFAULT
   97 #define AIOD_LIFETIME_DEFAULT   (30 * hz)
   98 #endif
   99 
  100 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
  101 
  102 static int max_aio_procs = MAX_AIO_PROCS;
  103 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
  104         CTLFLAG_RW, &max_aio_procs, 0,
  105         "Maximum number of kernel threads to use for handling async IO");
  106 
  107 static int num_aio_procs = 0;
  108 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
  109         CTLFLAG_RD, &num_aio_procs, 0,
  110         "Number of presently active kernel threads for async IO");
  111 
  112 /*
  113  * The code will adjust the actual number of AIO processes towards this
  114  * number when it gets a chance.
  115  */
  116 static int target_aio_procs = TARGET_AIO_PROCS;
  117 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
  118         0, "Preferred number of ready kernel threads for async IO");
  119 
  120 static int max_queue_count = MAX_AIO_QUEUE;
  121 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
  122     "Maximum number of aio requests to queue, globally");
  123 
  124 static int num_queue_count = 0;
  125 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
  126     "Number of queued aio requests");
  127 
  128 static int num_buf_aio = 0;
  129 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
  130     "Number of aio requests presently handled by the buf subsystem");
  131 
  132 /* Number of async I/O threads in the process of being started */
  133 /* XXX This should be local to _aio_aqueue() */
  134 static int num_aio_resv_start = 0;
  135 
  136 static int aiod_timeout;
  137 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
  138     "Timeout value for synchronous aio operations");
  139 
  140 static int aiod_lifetime;
  141 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
  142     "Maximum lifetime for idle aiod");
  143 
  144 static int max_aio_per_proc = MAX_AIO_PER_PROC;
  145 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
  146     0, "Maximum active aio requests per process (stored in the process)");
  147 
  148 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
  149 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
  150     &max_aio_queue_per_proc, 0,
  151     "Maximum queued aio requests per process (stored in the process)");
  152 
  153 static int max_buf_aio = MAX_BUF_AIO;
  154 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
  155     "Maximum buf aio requests per process (stored in the process)");
  156 
  157 /*
  158  * AIO daemon bookkeeping: each AIO daemon allocates one aioproclist for itself.
  159  */
  160 #define AIOP_FREE       0x1                     /* proc on free queue */
  161 #define AIOP_SCHED      0x2                     /* proc explicitly scheduled */
  162 
  163 struct aioproclist {
  164         int aioprocflags;                       /* AIO proc flags (AIOP_*) */
  165         TAILQ_ENTRY(aioproclist) list;          /* linkage on aio_freeproc or aio_activeproc */
  166         struct proc *aioproc;                   /* The AIO thread */
  167 };
  168 
  169 /*
  170  * Per-lio-request bookkeeping used for lio signal management; linked on the owner's kaio_liojoblist.
  171  */
  172 struct aio_liojob {
  173         int     lioj_flags;                     /* LIOJ_* flags */
  174         int     lioj_buffer_count;              /* member jobs that have a buf attached */
  175         int     lioj_buffer_finished_count;     /* ... of which are marked done */
  176         int     lioj_queue_count;               /* member jobs without a buf (daemon queue) */
  177         int     lioj_queue_finished_count;      /* ... of which are marked done */
  178         struct  sigevent lioj_signal;   /* signal on all I/O done */
  179         TAILQ_ENTRY(aio_liojob) lioj_list;      /* linkage on kaio_liojoblist */
  180         struct  kaioinfo *lioj_ki;              /* presumably the owning kaioinfo -- TODO confirm */
  181 };
  182 #define LIOJ_SIGNAL             0x1     /* signal on all done (lio) */
  183 #define LIOJ_SIGNAL_POSTED      0x2     /* signal has been posted */
  184 
  185 /*
  186  * Per-process AIO state, hung off p->p_aioinfo and set up by aio_init_aioinfo().
  187  */
  188 struct kaioinfo {
  189         int     kaio_flags;             /* per process kaio flags (KAIO_*) */
  190         int     kaio_maxactive_count;   /* maximum number of AIOs (from max_aio_per_proc) */
  191         int     kaio_active_count;      /* number of currently used AIOs */
  192         int     kaio_qallowed_count;    /* maximum size of AIO queue (from max_aio_queue_per_proc) */
  193         int     kaio_queue_count;       /* size of AIO queue */
  194         int     kaio_ballowed_count;    /* maximum number of buffers (from max_buf_aio) */
  195         int     kaio_queue_finished_count; /* number of daemon jobs finished */
  196         int     kaio_buffer_count;      /* number of physio buffers */
  197         int     kaio_buffer_finished_count; /* count of I/O done */
  198         struct  proc *kaio_p;           /* process that uses this kaio block */
  199         TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
  200         TAILQ_HEAD(,aiocblist) kaio_jobqueue;   /* job queue for process */
  201         TAILQ_HEAD(,aiocblist) kaio_jobdone;    /* done queue for process */
  202         TAILQ_HEAD(,aiocblist) kaio_bufqueue;   /* buffer job queue for process */
  203         TAILQ_HEAD(,aiocblist) kaio_bufdone;    /* buffer done queue for process */
  204         TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* queue for aios waiting on sockets */
  205 };
  206 
  207 #define KAIO_RUNDOWN    0x1     /* process is being run down */
  208 #define KAIO_WAKEUP     0x2     /* wakeup process when there is a significant event */
  209 
  210 static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
  211 static TAILQ_HEAD(,aiocblist) aio_jobs;                 /* Async job list */
  212 static TAILQ_HEAD(,aiocblist) aio_bufjobs;              /* Phys I/O job list */
  213 static TAILQ_HEAD(,aiocblist) aio_freejobs;             /* Pool of free jobs */
  214 
  215 static void     aio_init_aioinfo(struct proc *p);
  216 static void     aio_onceonly(void *);
  217 static int      aio_free_entry(struct aiocblist *aiocbe);
  218 static void     aio_process(struct aiocblist *aiocbe);
  219 static int      aio_newproc(void);
  220 static int      aio_aqueue(struct proc *p, struct aiocb *job, int type);
  221 static void     aio_physwakeup(struct buf *bp);
  222 static int      aio_fphysio(struct aiocblist *aiocbe);
  223 static int      aio_qphysio(struct proc *p, struct aiocblist *iocb);
  224 static void     aio_daemon(void *uproc);
  225 static void     process_signal(void *aioj);
  226 
  227 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
  228 
  229 /*
  230  * Zones for:
  231  *      kaio    Per process async io info
  232  *      aiop    async io thread data
  233  *      aiocb   async io jobs
  234  *      aiol    list io job pointer - internal to aio_suspend XXX
  235  *      aiolio  list io jobs
  236  */
  237 static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
  238 
  239 /*
  240  * Startup initialization
  241  */
  242 static void
  243 aio_onceonly(void *na)
  244 {
  245         TAILQ_INIT(&aio_freeproc);
  246         TAILQ_INIT(&aio_activeproc);
  247         TAILQ_INIT(&aio_jobs);
  248         TAILQ_INIT(&aio_bufjobs);
  249         TAILQ_INIT(&aio_freejobs);
  250         kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
  251         aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
  252         aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
  253         aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
  254         aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
  255         aiod_timeout = AIOD_TIMEOUT_DEFAULT;
  256         aiod_lifetime = AIOD_LIFETIME_DEFAULT;
  257         jobrefid = 1;
  258 }
  259 
  260 /*
  261  * Init the per-process aioinfo structure.  The aioinfo limits are set
  262  * per-process for user limit (resource) management.
  263  */
  264 static void
  265 aio_init_aioinfo(struct proc *p)
  266 {
  267         struct kaioinfo *ki;
  268         if (p->p_aioinfo == NULL) {
  269                 ki = zalloc(kaio_zone);
  270                 p->p_aioinfo = ki;
  271                 ki->kaio_flags = 0;
  272                 ki->kaio_maxactive_count = max_aio_per_proc;
  273                 ki->kaio_active_count = 0;
  274                 ki->kaio_qallowed_count = max_aio_queue_per_proc;
  275                 ki->kaio_queue_count = 0;
  276                 ki->kaio_ballowed_count = max_buf_aio;
  277                 ki->kaio_buffer_count = 0;
  278                 ki->kaio_buffer_finished_count = 0;
  279                 ki->kaio_p = p;
  280                 TAILQ_INIT(&ki->kaio_jobdone);
  281                 TAILQ_INIT(&ki->kaio_jobqueue);
  282                 TAILQ_INIT(&ki->kaio_bufdone);
  283                 TAILQ_INIT(&ki->kaio_bufqueue);
  284                 TAILQ_INIT(&ki->kaio_liojoblist);
  285                 TAILQ_INIT(&ki->kaio_sockqueue);
  286         }
  287         
  288         while (num_aio_procs < target_aio_procs)
  289                 aio_newproc();
  290 }
  291 
  292 /*
  293  * Free a job entry.  Wait for completion if it is currently active, but don't
  294  * delay forever.  If we delay, we return a flag that says that we have to
  295  * restart the queue scan.
  296  */
  297 static int
  298 aio_free_entry(struct aiocblist *aiocbe)
  299 {
  300         struct kaioinfo *ki;
  301         struct aio_liojob *lj;
  302         struct proc *p;
  303         int error;
  304         int s;
  305 
  306         if (aiocbe->jobstate == JOBST_NULL)
  307                 panic("aio_free_entry: freeing already free job");
  308 
  309         p = aiocbe->userproc;
  310         ki = p->p_aioinfo;
  311         lj = aiocbe->lio;
  312         if (ki == NULL)
  313                 panic("aio_free_entry: missing p->p_aioinfo");
  314 
  315         while (aiocbe->jobstate == JOBST_JOBRUNNING) {
  316                 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
  317                 tsleep(aiocbe, PRIBIO, "jobwai", 0);
  318         }
  319         if (aiocbe->bp == NULL) {
  320                 if (ki->kaio_queue_count <= 0)
  321                         panic("aio_free_entry: process queue size <= 0");
  322                 if (num_queue_count <= 0)
  323                         panic("aio_free_entry: system wide queue size <= 0");
  324         
  325                 if (lj) {
  326                         lj->lioj_queue_count--;
  327                         if (aiocbe->jobflags & AIOCBLIST_DONE)
  328                                 lj->lioj_queue_finished_count--;
  329                 }
  330                 ki->kaio_queue_count--;
  331                 if (aiocbe->jobflags & AIOCBLIST_DONE)
  332                         ki->kaio_queue_finished_count--;
  333                 num_queue_count--;
  334         } else {
  335                 if (lj) {
  336                         lj->lioj_buffer_count--;
  337                         if (aiocbe->jobflags & AIOCBLIST_DONE)
  338                                 lj->lioj_buffer_finished_count--;
  339                 }
  340                 if (aiocbe->jobflags & AIOCBLIST_DONE)
  341                         ki->kaio_buffer_finished_count--;
  342                 ki->kaio_buffer_count--;
  343                 num_buf_aio--;
  344         }
  345 
  346         /* aiocbe is going away, we need to destroy any knotes */
  347         knote_remove(p, &aiocbe->klist);
  348 
  349         if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
  350             && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
  351                 ki->kaio_flags &= ~KAIO_WAKEUP;
  352                 wakeup(p);
  353         }
  354 
  355         if (aiocbe->jobstate == JOBST_JOBQBUF) {
  356                 if ((error = aio_fphysio(aiocbe)) != 0)
  357                         return error;
  358                 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
  359                         panic("aio_free_entry: invalid physio finish-up state");
  360                 s = splbio();
  361                 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
  362                 splx(s);
  363         } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
  364                 s = splnet();
  365                 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
  366                 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
  367                 splx(s);
  368         } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
  369                 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
  370         else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
  371                 s = splbio();
  372                 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
  373                 splx(s);
  374                 if (aiocbe->bp) {
  375                         vunmapbuf(aiocbe->bp);
  376                         relpbuf(aiocbe->bp, NULL);
  377                         aiocbe->bp = NULL;
  378                 }
  379         }
  380         if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
  381                 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
  382                 zfree(aiolio_zone, lj);
  383         }
  384         aiocbe->jobstate = JOBST_NULL;
  385         untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
  386         fdrop(aiocbe->fd_file, curproc);
  387         TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
  388         return 0;
  389 }
  390 #endif /* VFS_AIO */
  391 
  /*
   * Run down all AIO state owned by process p.
   *
   * Does nothing when the kernel is built without VFS_AIO or when p never
   * allocated a kaioinfo.  Otherwise it:
   *   1. waits, in aiod_timeout slices, for daemon-run jobs and unfinished
   *      buffer jobs to drain, giving up on the first non-zero tsleep()
   *      return;
   *   2. moves jobs waiting on sockets back onto the ordinary job queues;
   *   3. frees every entry on the done, pending and buffer-done queues,
   *      restarting a scan whenever aio_free_entry() returns non-zero, and
   *      repeating the whole pass until all four queues are empty;
   *   4. frees the lio jobs that have no members left and finally the
   *      kaioinfo itself, clearing p->p_aioinfo.
   */
  void
  aio_proc_rundown(struct proc *p)
  {
  #ifndef VFS_AIO
          return;
  #else
          int s;
          struct kaioinfo *ki;
          struct aio_liojob *lj, *ljn;
          struct aiocblist *aiocbe, *aiocbn;
          struct file *fp;
          struct socket *so;

          ki = p->p_aioinfo;
          if (ki == NULL)
                  return;

          /*
           * Ask for a wakeup(p) on the next job completion.  This used to OR
           * in LIOJ_SIGNAL_POSTED, which is an aio_liojob flag; it only worked
           * because that constant has the same value (0x2) as KAIO_WAKEUP.
           * Use the kaioinfo flag so the intent is explicit.
           */
          ki->kaio_flags |= KAIO_WAKEUP;
          while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
              ki->kaio_buffer_finished_count)) {
                  ki->kaio_flags |= KAIO_RUNDOWN;
                  /* A non-zero return ends the wait even if work remains. */
                  if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
                          break;
          }

          /*
           * Move any aio ops that are waiting on socket I/O to the normal job
           * queues so they are cleaned up with any others.
           */
          s = splnet();
          for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
              aiocbn) {
                  /* Fetch the successor first: aiocbe is unlinked below. */
                  aiocbn = TAILQ_NEXT(aiocbe, plist);
                  fp = aiocbe->fd_file;
                  if (fp != NULL) {
                          so = (struct socket *)fp->f_data;
                          TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
                          /* No AIO left on this socket: clear the hints. */
                          if (TAILQ_EMPTY(&so->so_aiojobq)) {
                                  so->so_snd.sb_flags &= ~SB_AIO;
                                  so->so_rcv.sb_flags &= ~SB_AIO;
                          }
                  }
                  TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
                  TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
                  TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
          }
          splx(s);

          /* Free everything on the done queue. */
  restart1:
          for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
                  aiocbn = TAILQ_NEXT(aiocbe, plist);
                  if (aio_free_entry(aiocbe))
                          goto restart1;
          }

          /* Free everything still pending on the job queue. */
  restart2:
          for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
              aiocbn) {
                  aiocbn = TAILQ_NEXT(aiocbe, plist);
                  if (aio_free_entry(aiocbe))
                          goto restart2;
          }

  /*
   * Note the use of lots of splbio here, trying to avoid splbio for long chains
   * of I/O.  Probably unnecessary.
   */
          /* Wait for in-flight buffer jobs to leave kaio_bufqueue. */
  restart3:
          s = splbio();
          while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
                  ki->kaio_flags |= KAIO_WAKEUP;
                  tsleep(p, PRIBIO, "aioprn", 0);
                  splx(s);
                  goto restart3;
          }
          splx(s);

          /* Free the completed buffer jobs. */
  restart4:
          s = splbio();
          for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
                  aiocbn = TAILQ_NEXT(aiocbe, plist);
                  if (aio_free_entry(aiocbe)) {
                          splx(s);
                          goto restart4;
                  }
          }
          splx(s);

          /*
           * If we've slept, jobs might have moved from one queue to another.
           * Retry rundown if we didn't manage to empty the queues.
           */
          if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
              TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
              TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
              TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
                  goto restart1;

          /* Free lio jobs with no members left; report the rest if asked. */
          for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
                  ljn = TAILQ_NEXT(lj, lioj_list);
                  if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
                      0)) {
                          TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
                          zfree(aiolio_zone, lj);
                  } else {
  #ifdef DIAGNOSTIC
                          printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
                              "QF:%d\n", lj->lioj_buffer_count,
                              lj->lioj_buffer_finished_count,
                              lj->lioj_queue_count,
                              lj->lioj_queue_finished_count);
  #endif
                  }
          }

          zfree(kaio_zone, ki);
          p->p_aioinfo = NULL;
  #endif /* VFS_AIO */
  }
  514 
  515 #ifdef VFS_AIO
  516 /*
  517  * Select a job to run (called by an AIO daemon).
  518  */
  519 static struct aiocblist *
  520 aio_selectjob(struct aioproclist *aiop)
  521 {
  522         int s;
  523         struct aiocblist *aiocbe;
  524         struct kaioinfo *ki;
  525         struct proc *userp;
  526 
  527         s = splnet();
  528         for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
  529             TAILQ_NEXT(aiocbe, list)) {
  530                 userp = aiocbe->userproc;
  531                 ki = userp->p_aioinfo;
  532 
  533                 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
  534                         TAILQ_REMOVE(&aio_jobs, aiocbe, list);
  535                         splx(s);
  536                         return aiocbe;
  537                 }
  538         }
  539         splx(s);
  540 
  541         return NULL;
  542 }
  543 
  544 /*
  545  * The AIO processing activity.  This is the code that does the I/O request for
  546  * the non-physio version of the operations.  The normal vn operations are used,
  547  * and this code should work in all instances for every type of file, including
  548  * pipes, sockets, fifos, and regular files.
  549  */
  550 static void
  551 aio_process(struct aiocblist *aiocbe)
  552 {
  553         struct proc *mycp;
  554         struct aiocb *cb;
  555         struct file *fp;
  556         struct uio auio;
  557         struct iovec aiov;
  558         int cnt;
  559         int error;
  560         int oublock_st, oublock_end;
  561         int inblock_st, inblock_end;
  562 
  563         mycp = curproc;
  564         cb = &aiocbe->uaiocb;
  565         fp = aiocbe->fd_file;
  566 
  567         aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
  568         aiov.iov_len = cb->aio_nbytes;
  569 
  570         auio.uio_iov = &aiov;
  571         auio.uio_iovcnt = 1;
  572         auio.uio_offset = cb->aio_offset;
  573         auio.uio_resid = cb->aio_nbytes;
  574         cnt = cb->aio_nbytes;
  575         auio.uio_segflg = UIO_USERSPACE;
  576         auio.uio_procp = mycp;
  577 
  578         inblock_st = mycp->p_stats->p_ru.ru_inblock;
  579         oublock_st = mycp->p_stats->p_ru.ru_oublock;
  580         /*
  581          * _aio_aqueue() acquires a reference to the file that is
  582          * released in aio_free_entry().
  583          */
  584         if (cb->aio_lio_opcode == LIO_READ) {
  585                 auio.uio_rw = UIO_READ;
  586                 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
  587         } else {
  588                 auio.uio_rw = UIO_WRITE;
  589                 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
  590         }
  591         inblock_end = mycp->p_stats->p_ru.ru_inblock;
  592         oublock_end = mycp->p_stats->p_ru.ru_oublock;
  593 
  594         aiocbe->inputcharge = inblock_end - inblock_st;
  595         aiocbe->outputcharge = oublock_end - oublock_st;
  596 
  597         if ((error) && (auio.uio_resid != cnt)) {
  598                 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
  599                         error = 0;
  600                 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
  601                         psignal(aiocbe->userproc, SIGPIPE);
  602         }
  603 
  604         cnt -= auio.uio_resid;
  605         cb->_aiocb_private.error = error;
  606         cb->_aiocb_private.status = cnt;
  607 }
  608 
  609 /*
  610  * The AIO daemon, most of the actual work is done in aio_process,
  611  * but the setup (and address space mgmt) is done in this routine.
  612  */
  613 static void
  614 aio_daemon(void *uproc)
  615 {
  616         int s;
  617         struct aio_liojob *lj;
  618         struct aiocb *cb;
  619         struct aiocblist *aiocbe;
  620         struct aioproclist *aiop;
  621         struct kaioinfo *ki;
  622         struct proc *curcp, *mycp, *userp;
  623         struct vmspace *myvm, *tmpvm;
  624 
  625         /*
  626          * Local copies of curproc (cp) and vmspace (myvm)
  627          */
  628         mycp = curproc;
  629         myvm = mycp->p_vmspace;
  630 
  631         if (mycp->p_textvp) {
  632                 vrele(mycp->p_textvp);
  633                 mycp->p_textvp = NULL;
  634         }
  635 
  636         /*
  637          * Allocate and ready the aio control info.  There is one aiop structure
  638          * per daemon.
  639          */
  640         aiop = zalloc(aiop_zone);
  641         aiop->aioproc = mycp;
  642         aiop->aioprocflags |= AIOP_FREE;
  643 
  644         s = splnet();
  645 
  646         /*
  647          * Place thread (lightweight process) onto the AIO free thread list.
  648          */
  649         if (TAILQ_EMPTY(&aio_freeproc))
  650                 wakeup(&aio_freeproc);
  651         TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
  652 
  653         splx(s);
  654 
  655         /* Make up a name for the daemon. */
  656         strcpy(mycp->p_comm, "aiod");
  657 
  658         /*
  659          * Get rid of our current filedescriptors.  AIOD's don't need any
  660          * filedescriptors, except as temporarily inherited from the client.
  661          * Credentials are also cloned, and made equivalent to "root".
  662          */
  663         fdfree(mycp);
  664         mycp->p_fd = NULL;
  665         mycp->p_ucred = crcopy(mycp->p_ucred);
  666         mycp->p_ucred->cr_uid = 0;
  667         uifree(mycp->p_ucred->cr_uidinfo);
  668         mycp->p_ucred->cr_uidinfo = uifind(0);
  669         mycp->p_ucred->cr_ngroups = 1;
  670         mycp->p_ucred->cr_groups[0] = 1;
  671 
  672         /* The daemon resides in its own pgrp. */
  673         enterpgrp(mycp, mycp->p_pid, 1);
  674 
  675         /* Mark special process type. */
  676         mycp->p_flag |= P_SYSTEM | P_KTHREADP;
  677 
  678         /*
  679          * Wakeup parent process.  (Parent sleeps to keep from blasting away
  680          * and creating too many daemons.)
  681          */
  682         wakeup(mycp);
  683 
  684         for (;;) {
  685                 /*
  686                  * curcp is the current daemon process context.
  687                  * userp is the current user process context.
  688                  */
  689                 curcp = mycp;
  690 
  691                 /*
  692                  * Take daemon off of free queue
  693                  */
  694                 if (aiop->aioprocflags & AIOP_FREE) {
  695                         s = splnet();
  696                         TAILQ_REMOVE(&aio_freeproc, aiop, list);
  697                         TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
  698                         aiop->aioprocflags &= ~AIOP_FREE;
  699                         splx(s);
  700                 }
  701                 aiop->aioprocflags &= ~AIOP_SCHED;
  702 
  703                 /*
  704                  * Check for jobs.
  705                  */
  706                 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
  707                         cb = &aiocbe->uaiocb;
  708                         userp = aiocbe->userproc;
  709 
  710                         aiocbe->jobstate = JOBST_JOBRUNNING;
  711 
  712                         /*
  713                          * Connect to process address space for user program.
  714                          */
  715                         if (userp != curcp) {
  716                                 /*
  717                                  * Save the current address space that we are
  718                                  * connected to.
  719                                  */
  720                                 tmpvm = mycp->p_vmspace;
  721                                 
  722                                 /*
  723                                  * Point to the new user address space, and
  724                                  * refer to it.
  725                                  */
  726                                 mycp->p_vmspace = userp->p_vmspace;
  727                                 mycp->p_vmspace->vm_refcnt++;
  728                                 
  729                                 /* Activate the new mapping. */
  730                                 pmap_activate(mycp);
  731                                 
  732                                 /*
  733                                  * If the old address space wasn't the daemons
  734                                  * own address space, then we need to remove the
  735                                  * daemon's reference from the other process
  736                                  * that it was acting on behalf of.
  737                                  */
  738                                 if (tmpvm != myvm) {
  739                                         vmspace_free(tmpvm);
  740                                 }
  741                                 curcp = userp;
  742                         }
  743 
  744                         ki = userp->p_aioinfo;
  745                         lj = aiocbe->lio;
  746 
  747                         /* Account for currently active jobs. */
  748                         ki->kaio_active_count++;
  749 
  750                         /* Do the I/O function. */
  751                         aio_process(aiocbe);
  752 
  753                         /* Decrement the active job count. */
  754                         ki->kaio_active_count--;
  755 
  756                         /*
  757                          * Increment the completion count for wakeup/signal
  758                          * comparisons.
  759                          */
  760                         aiocbe->jobflags |= AIOCBLIST_DONE;
  761                         ki->kaio_queue_finished_count++;
  762                         if (lj)
  763                                 lj->lioj_queue_finished_count++;
  764                         if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
  765                             & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
  766                                 ki->kaio_flags &= ~KAIO_WAKEUP;
  767                                 wakeup(userp);
  768                         }
  769 
  770                         s = splbio();
  771                         if (lj && (lj->lioj_flags &
  772                             (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
  773                                 if ((lj->lioj_queue_finished_count ==
  774                                     lj->lioj_queue_count) &&
  775                                     (lj->lioj_buffer_finished_count ==
  776                                     lj->lioj_buffer_count)) {
  777                                                 psignal(userp,
  778                                                     lj->lioj_signal.sigev_signo);
  779                                                 lj->lioj_flags |=
  780                                                     LIOJ_SIGNAL_POSTED;
  781                                 }
  782                         }
  783                         splx(s);
  784 
  785                         aiocbe->jobstate = JOBST_JOBFINISHED;
  786 
  787                         s = splnet();
  788                         TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
  789                         TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
  790                         splx(s);
  791                         KNOTE(&aiocbe->klist, 0);
  792 
  793                         if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
  794                                 wakeup(aiocbe);
  795                                 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
  796                         }
  797 
  798                         if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
  799                                 psignal(userp, cb->aio_sigevent.sigev_signo);
  800                         }
  801                 }
  802 
  803                 /*
  804                  * Disconnect from user address space.
  805                  */
  806                 if (curcp != mycp) {
  807                         /* Get the user address space to disconnect from. */
  808                         tmpvm = mycp->p_vmspace;
  809                         
  810                         /* Get original address space for daemon. */
  811                         mycp->p_vmspace = myvm;
  812                         
  813                         /* Activate the daemon's address space. */
  814                         pmap_activate(mycp);
  815 #ifdef DIAGNOSTIC
  816                         if (tmpvm == myvm) {
  817                                 printf("AIOD: vmspace problem -- %d\n",
  818                                     mycp->p_pid);
  819                         }
  820 #endif
  821                         /* Remove our vmspace reference. */
  822                         vmspace_free(tmpvm);
  823 
  824                         curcp = mycp;
  825                 }
  826 
  827                 /*
  828                  * If we are the first to be put onto the free queue, wakeup
  829                  * anyone waiting for a daemon.
  830                  */
  831                 s = splnet();
  832                 TAILQ_REMOVE(&aio_activeproc, aiop, list);
  833                 if (TAILQ_EMPTY(&aio_freeproc))
  834                         wakeup(&aio_freeproc);
  835                 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
  836                 aiop->aioprocflags |= AIOP_FREE;
  837                 splx(s);
  838 
  839                 /*
  840                  * If daemon is inactive for a long time, allow it to exit,
  841                  * thereby freeing resources.
  842                  */
  843                 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
  844                     PRIBIO, "aiordy", aiod_lifetime)) {
  845                         s = splnet();
  846                         if (TAILQ_EMPTY(&aio_jobs)) {
  847                                 if ((aiop->aioprocflags & AIOP_FREE) &&
  848                                     (num_aio_procs > target_aio_procs)) {
  849                                         TAILQ_REMOVE(&aio_freeproc, aiop, list);
  850                                         splx(s);
  851                                         zfree(aiop_zone, aiop);
  852                                         num_aio_procs--;
  853 #ifdef DIAGNOSTIC
  854                                         if (mycp->p_vmspace->vm_refcnt <= 1) {
  855                                                 printf("AIOD: bad vm refcnt for"
  856                                                     " exiting daemon: %d\n",
  857                                                     mycp->p_vmspace->vm_refcnt);
  858                                         }
  859 #endif
  860                                         exit1(mycp, 0);
  861                                 }
  862                         }
  863                         splx(s);
  864                 }
  865         }
  866 }
  867 
  868 /*
  869  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
  870  * AIO daemon modifies its environment itself.
  871  */
  872 static int
  873 aio_newproc()
  874 {
  875         int error;
  876         struct proc *p, *np;
  877 
  878         p = &proc0;
  879         error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
  880         if (error)
  881                 return error;
  882         cpu_set_fork_handler(np, aio_daemon, curproc);
  883 
  884         /*
  885          * Wait until daemon is started, but continue on just in case to
  886          * handle error conditions.
  887          */
  888         error = tsleep(np, PZERO, "aiosta", aiod_timeout);
  889         num_aio_procs++;
  890 
  891         return error;
  892 }
  893 
/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices.  This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, _aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 *
 * Return values, as consumed by _aio_aqueue():
 *   0   The request was passed to BUF_STRATEGY().  A transfer error seen
 *       here is not returned; it is stored in the aiocb's private
 *       error/status fields instead.
 *  -1   The request is not eligible for this path (descriptor is not a
 *       vnode, vnode is not a disk, size is not a multiple of the device's
 *       si_bsize_phys, size does not fit in MAXPHYS, or the per-process
 *       buffer quota is exhausted).  The caller then falls back to the
 *       daemon-based method.
 *  >0   An errno value: a vn_isdisk() error other than ENOTBLK, or EFAULT
 *       when vmapbuf() fails.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
        int error;
        struct aiocb *cb;
        struct file *fp;
        struct buf *bp;
        struct vnode *vp;
        struct kaioinfo *ki;
        struct aio_liojob *lj;
        int s;
        int notify;

        cb = &aiocbe->uaiocb;
        fp = aiocbe->fd_file;

        /* Only vnode-backed descriptors can use this path. */
        if (fp->f_type != DTYPE_VNODE)
                return (-1);

        vp = (struct vnode *)fp->f_data;

        /*
         * If it's not a disk, we don't want to return a positive error.
         * It causes the aio code to not fall through to try the thread
         * way when you're talking to a regular file.
         */
        if (!vn_isdisk(vp, &error)) {
                if (error == ENOTBLK)
                        return (-1);
                else
                        return (error);
        }

        /* The transfer must be a whole number of physical blocks. */
        if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
                return (-1);

        /*
         * The transfer, starting at the buffer's offset within its first
         * page, must not exceed MAXPHYS.
         */
        if (cb->aio_nbytes >
            MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
                return (-1);

        /* Respect the per-process limit on outstanding buffer requests. */
        ki = p->p_aioinfo;
        if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
                return (-1);

        /* Account for the request; undone at doerror if mapping fails. */
        ki->kaio_buffer_count++;

        lj = aiocbe->lio;
        if (lj)
                lj->lioj_buffer_count++;

        /* Create and build a buffer header for a transfer. */
        bp = (struct buf *)getpbuf(NULL);
        BUF_KERNPROC(bp);

        /*
         * Get a copy of the kva from the physical buffer.
         */
        bp->b_caller1 = p;
        bp->b_dev = vp->v_rdev;
        error = 0;

        bp->b_bcount = cb->aio_nbytes;
        bp->b_bufsize = cb->aio_nbytes;
        bp->b_flags = B_PHYS | B_CALL | (cb->aio_lio_opcode == LIO_WRITE ?
            B_WRITE : B_READ);
        /* With B_CALL set, aio_physwakeup is the completion callback. */
        bp->b_iodone = aio_physwakeup;
        /* Remember the buffer's own data pointer, then aim it at user memory. */
        bp->b_saveaddr = bp->b_data;
        bp->b_data = (void *)(uintptr_t)cb->aio_buf;
        bp->b_blkno = btodb(cb->aio_offset);

        /* Bring buffer into kernel space. */
        if (vmapbuf(bp) < 0) {
                error = EFAULT;
                goto doerror;
        }

        /* Link the job and the buffer together and queue the job. */
        s = splbio();
        aiocbe->bp = bp;
        bp->b_spc = (void *)aiocbe;
        TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
        TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
        aiocbe->jobstate = JOBST_JOBQBUF;
        /* Preset the status to the full byte count; cleared below on error. */
        cb->_aiocb_private.status = cb->aio_nbytes;
        num_buf_aio++;
        bp->b_error = 0;

        splx(s);

        /* Perform transfer. */
        BUF_STRATEGY(bp, 0);

        notify = 0;
        s = splbio();

        /*
         * If we had an error invoking the request, or an error in processing
         * the request before we have returned, we process it as an error in
         * transfer.  Note that such an I/O error is not indicated immediately,
         * but is returned using the aio_error mechanism.  In this case,
         * aio_suspend will return immediately.
         */
        if (bp->b_error || (bp->b_flags & B_ERROR)) {
                struct aiocb *job = aiocbe->uuaiocb;

                /* Record the failure in both the kernel and the user copy. */
                aiocbe->uaiocb._aiocb_private.status = 0;
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = bp->b_error;
                suword(&job->_aiocb_private.error, bp->b_error);

                ki->kaio_buffer_finished_count++;

                /*
                 * Move the job to the done queue unless it is already
                 * marked finished.
                 */
                if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
                        aiocbe->jobstate = JOBST_JOBBFINISHED;
                        aiocbe->jobflags |= AIOCBLIST_DONE;
                        TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
                        TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
                        TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
                        notify = 1;
                }
        }
        splx(s);
        /* Post the kevent note only after the interrupt level is restored. */
        if (notify)
                KNOTE(&aiocbe->klist, 0);
        return 0;

doerror:
        /* Mapping failed: undo the accounting and give the buffer back. */
        ki->kaio_buffer_count--;
        if (lj)
                lj->lioj_buffer_count--;
        aiocbe->bp = NULL;
        relpbuf(bp, NULL);
        return error;
}
 1036 
 1037 /*
 1038  * This waits/tests physio completion.
 1039  */
 1040 static int
 1041 aio_fphysio(struct aiocblist *iocb)
 1042 {
 1043         int s;
 1044         struct buf *bp;
 1045         int error;
 1046 
 1047         bp = iocb->bp;
 1048 
 1049         s = splbio();
 1050         while ((bp->b_flags & B_DONE) == 0) {
 1051                 if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
 1052                         if ((bp->b_flags & B_DONE) == 0) {
 1053                                 splx(s);
 1054                                 return EINPROGRESS;
 1055                         } else
 1056                                 break;
 1057                 }
 1058         }
 1059         splx(s);
 1060 
 1061         /* Release mapping into kernel space. */
 1062         vunmapbuf(bp);
 1063         iocb->bp = 0;
 1064 
 1065         error = 0;
 1066         
 1067         /* Check for an error. */
 1068         if (bp->b_flags & B_ERROR)
 1069                 error = bp->b_error;
 1070 
 1071         relpbuf(bp, NULL);
 1072         return (error);
 1073 }
 1074 #endif /* VFS_AIO */
 1075 
 1076 /*
 1077  * Wake up aio requests that may be serviceable now.
 1078  */
 1079 void
 1080 aio_swake(struct socket *so, struct sockbuf *sb)
 1081 {
 1082 #ifndef VFS_AIO
 1083         return;
 1084 #else
 1085         struct aiocblist *cb,*cbn;
 1086         struct proc *p;
 1087         struct kaioinfo *ki = NULL;
 1088         int opcode, wakecount = 0;
 1089         struct aioproclist *aiop;
 1090 
 1091         if (sb == &so->so_snd) {
 1092                 opcode = LIO_WRITE;
 1093                 so->so_snd.sb_flags &= ~SB_AIO;
 1094         } else {
 1095                 opcode = LIO_READ;
 1096                 so->so_rcv.sb_flags &= ~SB_AIO;
 1097         }
 1098 
 1099         for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
 1100                 cbn = TAILQ_NEXT(cb, list);
 1101                 if (opcode == cb->uaiocb.aio_lio_opcode) {
 1102                         p = cb->userproc;
 1103                         ki = p->p_aioinfo;
 1104                         TAILQ_REMOVE(&so->so_aiojobq, cb, list);
 1105                         TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
 1106                         TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
 1107                         TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
 1108                         wakecount++;
 1109                         if (cb->jobstate != JOBST_JOBQGLOBAL)
 1110                                 panic("invalid queue value");
 1111                 }
 1112         }
 1113 
 1114         while (wakecount--) {
 1115                 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
 1116                         TAILQ_REMOVE(&aio_freeproc, aiop, list);
 1117                         TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
 1118                         aiop->aioprocflags &= ~AIOP_FREE;
 1119                         wakeup(aiop->aioproc);
 1120                 }
 1121         }
 1122 #endif /* VFS_AIO */
 1123 }
 1124 
 1125 #ifdef VFS_AIO
/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
 * technique is done in this code.
 *
 * Parameters:
 *   p     the process submitting the request.
 *   job   user-space address of the caller's struct aiocb.
 *   lj    list-I/O job this request belongs to, or NULL (aio_aqueue()
 *         passes NULL).
 *   type  operation to perform.  When it is not LIO_NOP it overrides the
 *         aio_lio_opcode stored in the control block.
 *
 * Returns 0 when the request was queued (or was a no-op), otherwise an errno
 * value.  Several failure paths also store the error in the user aiocb's
 * private error field, but only when type == 0.
 *
 * NOTE(review): the code tests both "type != LIO_NOP" and "type == 0";
 * presumably these are the same condition -- confirm LIO_NOP is 0.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
        struct filedesc *fdp;
        struct file *fp;
        unsigned int fd;
        struct socket *so;
        int s;
        int error;
        int opcode, user_opcode;
        struct aiocblist *aiocbe;
        struct aioproclist *aiop;
        struct kaioinfo *ki;
        struct kevent kev;
        struct kqueue *kq;
        struct file *kq_fp;

        /* Reuse a free control block if there is one, else allocate. */
        if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
                TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
        else
                aiocbe = zalloc (aiocb_zone);

        aiocbe->inputcharge = 0;
        aiocbe->outputcharge = 0;
        callout_handle_init(&aiocbe->timeouthandle);
        SLIST_INIT(&aiocbe->klist);

        /* Preset the user-visible private fields before anything can fail. */
        suword(&job->_aiocb_private.status, -1);
        suword(&job->_aiocb_private.error, 0);
        suword(&job->_aiocb_private.kernelinfo, -1);

        /* Take a kernel copy of the user's control block. */
        error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
        if (error) {
                suword(&job->_aiocb_private.error, error);
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                return error;
        }
        /* A signal notification must name a valid signal. */
        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
            !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                return EINVAL;
        }

        /* Save userspace address of the job info. */
        aiocbe->uuaiocb = job;

        /*
         * Get the opcode.  The value supplied by the user is kept in
         * user_opcode because it is examined again for the kevent case
         * below.
         */
        user_opcode = aiocbe->uaiocb.aio_lio_opcode;
        if (type != LIO_NOP)
                aiocbe->uaiocb.aio_lio_opcode = type;
        opcode = aiocbe->uaiocb.aio_lio_opcode;

        /* Get the fd info for process. */
        fdp = p->p_fd;

        /*
         * Range check file descriptor.
         */
        fd = aiocbe->uaiocb.aio_fildes;
        if (fd >= fdp->fd_nfiles) {
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }

        /* The descriptor must be open, and open for writing if we write. */
        fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
        if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
            0))) {
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }
        /*
         * Hold the file.  The failure paths visible here that go through
         * aqueue_fail, and the LIO_NOP path, release it with fdrop().
         */
        fhold(fp);

        /* An offset of -1 is rejected. */
        if (aiocbe->uaiocb.aio_offset == -1LL) {
                error = EINVAL;
                goto aqueue_fail;
        }
        /* Hand the job reference id to userland and remember it ourselves. */
        error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
        if (error) {
                error = EINVAL;
                goto aqueue_fail;
        }
        aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
        /* Advance the id, wrapping to 1 after LONG_MAX. */
        if (jobrefid == LONG_MAX)
                jobrefid = 1;
        else
                jobrefid++;

        /* A no-op completes at once: release everything and return. */
        if (opcode == LIO_NOP) {
                fdrop(fp, p);
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                if (type == 0) {
                        suword(&job->_aiocb_private.error, 0);
                        suword(&job->_aiocb_private.status, 0);
                        suword(&job->_aiocb_private.kernelinfo, 0);
                }
                return 0;
        }
        /* Only reads and writes are supported beyond this point. */
        if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
                if (type == 0)
                        suword(&job->_aiocb_private.status, 0);
                error = EINVAL;
                goto aqueue_fail;
        }

        /*
         * Work out whether kevent notification was requested, and if so
         * obtain the kqueue descriptor and user data for it.
         */
        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
                kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
                kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
        }
        else {
                /*
                 * This method for requesting kevent-based notification won't
                 * work on the alpha, since we're passing in a pointer
                 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
                 * based method instead.
                 */
                if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
                    user_opcode == LIO_WRITE)
                        goto no_kqueue;

                error = copyin((struct kevent *)(uintptr_t)user_opcode,
                    &kev, sizeof(kev));
                if (error)
                        goto aqueue_fail;
        }
        /* kev.ident must name an open kqueue descriptor of this process. */
        if ((u_int)kev.ident >= fdp->fd_nfiles ||
            (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
            (kq_fp->f_type != DTYPE_KQUEUE)) {
                error = EBADF;
                goto aqueue_fail;
        }
        /*
         * Register an EVFILT_AIO event identified by the user's aiocb
         * address, carrying the kernel control block in kev.data.
         */
        kq = (struct kqueue *)kq_fp->f_data;
        kev.ident = (uintptr_t)aiocbe->uuaiocb;
        kev.filter = EVFILT_AIO;
        kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
        kev.data = (intptr_t)aiocbe;
        error = kqueue_register(kq, &kev, p);
        /*
         * Also reached by falling through after kqueue_register(); in that
         * case error may be 0 and the block below is skipped.
         */
aqueue_fail:
        if (error) {
                fdrop(fp, p);
                TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
                if (type == 0)
                        suword(&job->_aiocb_private.error, error);
                goto done;
        }
no_kqueue:

        /* The request is accepted: mark it in progress and fill in the job. */
        suword(&job->_aiocb_private.error, EINPROGRESS);
        aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
        aiocbe->userproc = p;
        aiocbe->jobflags = 0;
        aiocbe->lio = lj;
        ki = p->p_aioinfo;

        if (fp->f_type == DTYPE_SOCKET) {
                /*
                 * Alternate queueing for socket ops: Reach down into the
                 * descriptor to get the socket data.  Then check to see if the
                 * socket is ready to be read or written (based on the requested
                 * operation).
                 *
                 * If it is not ready for io, then queue the aiocbe on the
                 * socket, and set the flags so we get a call when sbnotify()
                 * happens.
                 */
                so = (struct socket *)fp->f_data;
                s = splnet();
                if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
                    LIO_WRITE) && (!sowriteable(so)))) {
                        TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
                        TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
                        if (opcode == LIO_READ)
                                so->so_rcv.sb_flags |= SB_AIO;
                        else
                                so->so_snd.sb_flags |= SB_AIO;
                        aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
                        ki->kaio_queue_count++;
                        num_queue_count++;
                        splx(s);
                        error = 0;
                        goto done;
                }
                splx(s);
        }

        /*
         * Try the direct physio path first.  0 means it took the request,
         * a positive value is an errno, and a negative value means "not
         * eligible", in which case we fall through to the daemon queue.
         *
         * NOTE(review): the positive-error branch below returns without an
         * fdrop() and without putting aiocbe back on aio_freejobs; confirm
         * that the file reference and control block are not leaked.
         */
        if ((error = aio_qphysio(p, aiocbe)) == 0)
                goto done;
        if (error > 0) {
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = error;
                suword(&job->_aiocb_private.error, error);
                goto done;
        }

        /* No buffer for daemon I/O. */
        aiocbe->bp = NULL;

        /* Account for the job and put it on the per-process and global queues. */
        ki->kaio_queue_count++;
        if (lj)
                lj->lioj_queue_count++;
        s = splnet();
        TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
        TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
        splx(s);
        aiocbe->jobstate = JOBST_JOBQGLOBAL;

        num_queue_count++;
        error = 0;

        /*
         * If we don't have a free AIO process, and we are below our quota, then
         * start one.  Otherwise, depend on the subsequent I/O completions to
         * pick-up this job.  If we don't successfully create the new process
         * (thread) due to resource issues, we return an error for now (EAGAIN),
         * which is likely not the correct thing to do.
         */
        s = splnet();
retryproc:
        if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
                /* An idle daemon exists: mark it active and wake it. */
                TAILQ_REMOVE(&aio_freeproc, aiop, list);
                TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                aiop->aioprocflags &= ~AIOP_FREE;
                wakeup(aiop->aioproc);
        } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
            ((ki->kaio_active_count + num_aio_resv_start) <
            ki->kaio_maxactive_count)) {
                /*
                 * num_aio_resv_start is raised while aio_newproc() runs so
                 * that the quota checks above include the daemon being
                 * started.
                 */
                num_aio_resv_start++;
                if ((error = aio_newproc()) == 0) {
                        num_aio_resv_start--;
                        goto retryproc;
                }
                num_aio_resv_start--;
        }
        splx(s);
done:
        return error;
}
 1370 
 1371 /*
 1372  * This routine queues an AIO request, checking for quotas.
 1373  */
 1374 static int
 1375 aio_aqueue(struct proc *p, struct aiocb *job, int type)
 1376 {
 1377         struct kaioinfo *ki;
 1378 
 1379         if (p->p_aioinfo == NULL)
 1380                 aio_init_aioinfo(p);
 1381 
 1382         if (num_queue_count >= max_queue_count)
 1383                 return EAGAIN;
 1384 
 1385         ki = p->p_aioinfo;
 1386         if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
 1387                 return EAGAIN;
 1388 
 1389         return _aio_aqueue(p, job, NULL, type);
 1390 }
 1391 #endif /* VFS_AIO */
 1392 
 1393 /*
 1394  * Support the aio_return system call, as a side-effect, kernel resources are
 1395  * released.
 1396  */
 1397 int
 1398 aio_return(struct proc *p, struct aio_return_args *uap)
 1399 {
 1400 #ifndef VFS_AIO
 1401         return ENOSYS;
 1402 #else
 1403         int s;
 1404         long jobref;
 1405         struct aiocblist *cb, *ncb;
 1406         struct aiocb *ujob;
 1407         struct kaioinfo *ki;
 1408 
 1409         ki = p->p_aioinfo;
 1410         if (ki == NULL)
 1411                 return EINVAL;
 1412 
 1413         ujob = uap->aiocbp;
 1414 
 1415         jobref = fuword(&ujob->_aiocb_private.kernelinfo);
 1416         if (jobref == -1 || jobref == 0)
 1417                 return EINVAL;
 1418 
 1419         TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
 1420                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
 1421                     jobref) {
 1422                         if (ujob == cb->uuaiocb) {
 1423                                 p->p_retval[0] =
 1424                                     cb->uaiocb._aiocb_private.status;
 1425                         } else
 1426                                 p->p_retval[0] = EFAULT;
 1427                         if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 1428                                 p->p_stats->p_ru.ru_oublock +=
 1429                                     cb->outputcharge;
 1430                                 cb->outputcharge = 0;
 1431                         } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 1432                                 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
 1433                                 cb->inputcharge = 0;
 1434                         }
 1435                         aio_free_entry(cb);
 1436                         return 0;
 1437                 }
 1438         }
 1439         s = splbio();
 1440         for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
 1441                 ncb = TAILQ_NEXT(cb, plist);
 1442                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
 1443                     == jobref) {
 1444                         splx(s);
 1445                         if (ujob == cb->uuaiocb) {
 1446                                 p->p_retval[0] =
 1447                                     cb->uaiocb._aiocb_private.status;
 1448                         } else
 1449                                 p->p_retval[0] = EFAULT;
 1450                         aio_free_entry(cb);
 1451                         return 0;
 1452                 }
 1453         }
 1454         splx(s);
 1455 
 1456         return (EINVAL);
 1457 #endif /* VFS_AIO */
 1458 }
 1459 
 1460 /*
 1461  * Allow a process to wakeup when any of the I/O requests are completed.
 1462  */
 1463 int
 1464 aio_suspend(struct proc *p, struct aio_suspend_args *uap)
 1465 {
 1466 #ifndef VFS_AIO
 1467         return ENOSYS;
 1468 #else
 1469         struct timeval atv;
 1470         struct timespec ts;
 1471         struct aiocb *const *cbptr, *cbp;
 1472         struct kaioinfo *ki;
 1473         struct aiocblist *cb;
 1474         int i;
 1475         int njoblist;
 1476         int error, s, timo;
 1477         long *ijoblist;
 1478         struct aiocb **ujoblist;
 1479         
 1480         if (uap->nent > AIO_LISTIO_MAX)
 1481                 return EINVAL;
 1482 
 1483         timo = 0;
 1484         if (uap->timeout) {
 1485                 /* Get timespec struct. */
 1486                 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 1487                         return error;
 1488 
 1489                 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
 1490                         return (EINVAL);
 1491 
 1492                 TIMESPEC_TO_TIMEVAL(&atv, &ts);
 1493                 if (itimerfix(&atv))
 1494                         return (EINVAL);
 1495                 timo = tvtohz(&atv);
 1496         }
 1497 
 1498         ki = p->p_aioinfo;
 1499         if (ki == NULL)
 1500                 return EAGAIN;
 1501 
 1502         njoblist = 0;
 1503         ijoblist = zalloc(aiol_zone);
 1504         ujoblist = zalloc(aiol_zone);
 1505         cbptr = uap->aiocbp;
 1506 
 1507         for (i = 0; i < uap->nent; i++) {
 1508                 cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 1509                 if (cbp == 0)
 1510                         continue;
 1511                 ujoblist[njoblist] = cbp;
 1512                 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
 1513                 njoblist++;
 1514         }
 1515 
 1516         if (njoblist == 0) {
 1517                 zfree(aiol_zone, ijoblist);
 1518                 zfree(aiol_zone, ujoblist);
 1519                 return 0;
 1520         }
 1521 
 1522         error = 0;
 1523         for (;;) {
 1524                 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
 1525                         for (i = 0; i < njoblist; i++) {
 1526                                 if (((intptr_t)
 1527                                     cb->uaiocb._aiocb_private.kernelinfo) ==
 1528                                     ijoblist[i]) {
 1529                                         if (ujoblist[i] != cb->uuaiocb)
 1530                                                 error = EINVAL;
 1531                                         zfree(aiol_zone, ijoblist);
 1532                                         zfree(aiol_zone, ujoblist);
 1533                                         return error;
 1534                                 }
 1535                         }
 1536                 }
 1537 
 1538                 s = splbio();
 1539                 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
 1540                     TAILQ_NEXT(cb, plist)) {
 1541                         for (i = 0; i < njoblist; i++) {
 1542                                 if (((intptr_t)
 1543                                     cb->uaiocb._aiocb_private.kernelinfo) ==
 1544                                     ijoblist[i]) {
 1545                                         splx(s);
 1546                                         if (ujoblist[i] != cb->uuaiocb)
 1547                                                 error = EINVAL;
 1548                                         zfree(aiol_zone, ijoblist);
 1549                                         zfree(aiol_zone, ujoblist);
 1550                                         return error;
 1551                                 }
 1552                         }
 1553                 }
 1554 
 1555                 ki->kaio_flags |= KAIO_WAKEUP;
 1556                 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
 1557                 splx(s);
 1558 
 1559                 if (error == ERESTART || error == EINTR) {
 1560                         zfree(aiol_zone, ijoblist);
 1561                         zfree(aiol_zone, ujoblist);
 1562                         return EINTR;
 1563                 } else if (error == EWOULDBLOCK) {
 1564                         zfree(aiol_zone, ijoblist);
 1565                         zfree(aiol_zone, ujoblist);
 1566                         return EAGAIN;
 1567                 }
 1568         }
 1569 
 1570 /* NOTREACHED */
 1571         return EINVAL;
 1572 #endif /* VFS_AIO */
 1573 }
 1574 
 1575 /*
 1576  * aio_cancel cancels any non-physio aio operations not currently in
 1577  * progress.
 1578  */
 1579 int
 1580 aio_cancel(struct proc *p, struct aio_cancel_args *uap)
 1581 {
 1582 #ifndef VFS_AIO
 1583         return ENOSYS;
 1584 #else
 1585         struct kaioinfo *ki;
 1586         struct aiocblist *cbe, *cbn;
 1587         struct file *fp;
 1588         struct filedesc *fdp;
 1589         struct socket *so;
 1590         struct proc *po;
 1591         int s,error;
 1592         int cancelled=0;
 1593         int notcancelled=0;
 1594         struct vnode *vp;
 1595 
 1596         fdp = p->p_fd;
 1597         if ((u_int)uap->fd >= fdp->fd_nfiles ||
 1598             (fp = fdp->fd_ofiles[uap->fd]) == NULL)
 1599                 return (EBADF);
 1600 
 1601         if (fp->f_type == DTYPE_VNODE) {
 1602                 vp = (struct vnode *)fp->f_data;
 1603                 
 1604                 if (vn_isdisk(vp,&error)) {
 1605                         p->p_retval[0] = AIO_NOTCANCELED;
 1606                         return 0;
 1607                 }
 1608         } else if (fp->f_type == DTYPE_SOCKET) {
 1609                 so = (struct socket *)fp->f_data;
 1610 
 1611                 s = splnet();
 1612 
 1613                 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
 1614                         cbn = TAILQ_NEXT(cbe, list);
 1615                         if ((uap->aiocbp == NULL) ||
 1616                                 (uap->aiocbp == cbe->uuaiocb) ) {
 1617                                 po = cbe->userproc;
 1618                                 ki = po->p_aioinfo;
 1619                                 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
 1620                                 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
 1621                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
 1622                                 if (ki->kaio_flags & KAIO_WAKEUP) {
 1623                                         wakeup(po);
 1624                                 }
 1625                                 cbe->jobstate = JOBST_JOBFINISHED;
 1626                                 cbe->uaiocb._aiocb_private.status=-1;
 1627                                 cbe->uaiocb._aiocb_private.error=ECANCELED;
 1628                                 cancelled++;
 1629 /* XXX cancelled, knote? */
 1630                                 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
 1631                                     SIGEV_SIGNAL)
 1632                                         psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
 1633                                 if (uap->aiocbp) 
 1634                                         break;
 1635                         }
 1636                 }
 1637                 splx(s);
 1638 
 1639                 if ((cancelled) && (uap->aiocbp)) {
 1640                         p->p_retval[0] = AIO_CANCELED;
 1641                         return 0;
 1642                 }
 1643         }
 1644         ki=p->p_aioinfo;
 1645         if (ki == NULL)
 1646                 goto done;
 1647         s = splnet();
 1648 
 1649         for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
 1650                 cbn = TAILQ_NEXT(cbe, plist);
 1651 
 1652                 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
 1653                     ((uap->aiocbp == NULL ) || 
 1654                      (uap->aiocbp == cbe->uuaiocb))) {
 1655                         
 1656                         if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 1657                                 TAILQ_REMOVE(&aio_jobs, cbe, list);
 1658                                 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
 1659                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
 1660                                     plist);
 1661                                 cancelled++;
 1662                                 ki->kaio_queue_finished_count++;
 1663                                 cbe->jobstate = JOBST_JOBFINISHED;
 1664                                 cbe->uaiocb._aiocb_private.status = -1;
 1665                                 cbe->uaiocb._aiocb_private.error = ECANCELED;
 1666 /* XXX cancelled, knote? */
 1667                                 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
 1668                                     SIGEV_SIGNAL)
 1669                                         psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
 1670                         } else {
 1671                                 notcancelled++;
 1672                         }
 1673                 }
 1674         }
 1675         splx(s);
 1676 done:
 1677         if (notcancelled) {
 1678                 p->p_retval[0] = AIO_NOTCANCELED;
 1679                 return 0;
 1680         }
 1681         if (cancelled) {
 1682                 p->p_retval[0] = AIO_CANCELED;
 1683                 return 0;
 1684         }
 1685         p->p_retval[0] = AIO_ALLDONE;
 1686 
 1687         return 0;
 1688 #endif /* VFS_AIO */
 1689 }
 1690 
 1691 /*
 1692  * aio_error is implemented in the kernel level for compatibility purposes only.
 1693  * For a user mode async implementation, it would be best to do it in a userland
 1694  * subroutine.
 1695  */
 1696 int
 1697 aio_error(struct proc *p, struct aio_error_args *uap)
 1698 {
 1699 #ifndef VFS_AIO
 1700         return ENOSYS;
 1701 #else
 1702         int s;
 1703         struct aiocblist *cb;
 1704         struct kaioinfo *ki;
 1705         long jobref;
 1706 
 1707         ki = p->p_aioinfo;
 1708         if (ki == NULL)
 1709                 return EINVAL;
 1710 
 1711         jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
 1712         if ((jobref == -1) || (jobref == 0))
 1713                 return EINVAL;
 1714 
 1715         TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
 1716                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
 1717                     jobref) {
 1718                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
 1719                         return 0;
 1720                 }
 1721         }
 1722 
 1723         s = splnet();
 1724 
 1725         for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
 1726             plist)) {
 1727                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
 1728                     jobref) {
 1729                         p->p_retval[0] = EINPROGRESS;
 1730                         splx(s);
 1731                         return 0;
 1732                 }
 1733         }
 1734 
 1735         for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
 1736             plist)) {
 1737                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
 1738                     jobref) {
 1739                         p->p_retval[0] = EINPROGRESS;
 1740                         splx(s);
 1741                         return 0;
 1742                 }
 1743         }
 1744         splx(s);
 1745 
 1746         s = splbio();
 1747         for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
 1748             plist)) {
 1749                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
 1750                     jobref) {
 1751                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
 1752                         splx(s);
 1753                         return 0;
 1754                 }
 1755         }
 1756 
 1757         for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
 1758             plist)) {
 1759                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
 1760                     jobref) {
 1761                         p->p_retval[0] = EINPROGRESS;
 1762                         splx(s);
 1763                         return 0;
 1764                 }
 1765         }
 1766         splx(s);
 1767 
 1768 #if (0)
 1769         /*
 1770          * Hack for lio.
 1771          */
 1772         status = fuword(&uap->aiocbp->_aiocb_private.status);
 1773         if (status == -1)
 1774                 return fuword(&uap->aiocbp->_aiocb_private.error);
 1775 #endif
 1776         return EINVAL;
 1777 #endif /* VFS_AIO */
 1778 }
 1779 
 1780 /* syscall - asynchronous read from a file (REALTIME) */
 1781 int
 1782 aio_read(struct proc *p, struct aio_read_args *uap)
 1783 {
 1784 #ifndef VFS_AIO
 1785         return ENOSYS;
 1786 #else
 1787         return aio_aqueue(p, uap->aiocbp, LIO_READ);
 1788 #endif /* VFS_AIO */
 1789 }
 1790 
 1791 /* syscall - asynchronous write to a file (REALTIME) */
 1792 int
 1793 aio_write(struct proc *p, struct aio_write_args *uap)
 1794 {
 1795 #ifndef VFS_AIO
 1796         return ENOSYS;
 1797 #else
 1798         return aio_aqueue(p, uap->aiocbp, LIO_WRITE);
 1799 #endif /* VFS_AIO */
 1800 }
 1801 
 1802 /* syscall - XXX undocumented */
 1803 int
 1804 lio_listio(struct proc *p, struct lio_listio_args *uap)
 1805 {
 1806 #ifndef VFS_AIO
 1807         return ENOSYS;
 1808 #else
 1809         int nent, nentqueued;
 1810         struct aiocb *iocb, * const *cbptr;
 1811         struct aiocblist *cb;
 1812         struct kaioinfo *ki;
 1813         struct aio_liojob *lj;
 1814         int error, runningcode;
 1815         int nerror;
 1816         int i;
 1817         int s;
 1818 
 1819         if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 1820                 return EINVAL;
 1821 
 1822         nent = uap->nent;
 1823         if (nent > AIO_LISTIO_MAX)
 1824                 return EINVAL;
 1825 
 1826         if (p->p_aioinfo == NULL)
 1827                 aio_init_aioinfo(p);
 1828 
 1829         if ((nent + num_queue_count) > max_queue_count)
 1830                 return EAGAIN;
 1831 
 1832         ki = p->p_aioinfo;
 1833         if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
 1834                 return EAGAIN;
 1835 
 1836         lj = zalloc(aiolio_zone);
 1837         if (!lj)
 1838                 return EAGAIN;
 1839 
 1840         lj->lioj_flags = 0;
 1841         lj->lioj_buffer_count = 0;
 1842         lj->lioj_buffer_finished_count = 0;
 1843         lj->lioj_queue_count = 0;
 1844         lj->lioj_queue_finished_count = 0;
 1845         lj->lioj_ki = ki;
 1846 
 1847         /*
 1848          * Setup signal.
 1849          */
 1850         if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 1851                 error = copyin(uap->sig, &lj->lioj_signal,
 1852                     sizeof(lj->lioj_signal));
 1853                 if (error) {
 1854                         zfree(aiolio_zone, lj);
 1855                         return error;
 1856                 }
 1857                 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 1858                         zfree(aiolio_zone, lj);
 1859                         return EINVAL;
 1860                 }
 1861                 lj->lioj_flags |= LIOJ_SIGNAL;
 1862                 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
 1863         } else
 1864                 lj->lioj_flags &= ~LIOJ_SIGNAL;
 1865 
 1866         TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 1867         /*
 1868          * Get pointers to the list of I/O requests.
 1869          */
 1870         nerror = 0;
 1871         nentqueued = 0;
 1872         cbptr = uap->acb_list;
 1873         for (i = 0; i < uap->nent; i++) {
 1874                 iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 1875                 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
 1876                         error = _aio_aqueue(p, iocb, lj, 0);
 1877                         if (error == 0)
 1878                                 nentqueued++;
 1879                         else
 1880                                 nerror++;
 1881                 }
 1882         }
 1883 
 1884         /*
 1885          * If we haven't queued any, then just return error.
 1886          */
 1887         if (nentqueued == 0)
 1888                 return 0;
 1889 
 1890         /*
 1891          * Calculate the appropriate error return.
 1892          */
 1893         runningcode = 0;
 1894         if (nerror)
 1895                 runningcode = EIO;
 1896 
 1897         if (uap->mode == LIO_WAIT) {
 1898                 int command, found, jobref;
 1899                 
 1900                 for (;;) {
 1901                         found = 0;
 1902                         for (i = 0; i < uap->nent; i++) {
 1903                                 /*
 1904                                  * Fetch address of the control buf pointer in
 1905                                  * user space.
 1906                                  */
 1907                                 iocb = (struct aiocb *)
 1908                                     (intptr_t)fuword(&cbptr[i]);
 1909                                 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
 1910                                     == 0))
 1911                                         continue;
 1912 
 1913                                 /*
 1914                                  * Fetch the associated command from user space.
 1915                                  */
 1916                                 command = fuword(&iocb->aio_lio_opcode);
 1917                                 if (command == LIO_NOP) {
 1918                                         found++;
 1919                                         continue;
 1920                                 }
 1921 
 1922                                 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
 1923 
 1924                                 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
 1925                                         if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
 1926                                             == jobref) {
 1927                                                 if (cb->uaiocb.aio_lio_opcode
 1928                                                     == LIO_WRITE) {
 1929                                                         p->p_stats->p_ru.ru_oublock
 1930                                                             +=
 1931                                                             cb->outputcharge;
 1932                                                         cb->outputcharge = 0;
 1933                                                 } else if (cb->uaiocb.aio_lio_opcode
 1934                                                     == LIO_READ) {
 1935                                                         p->p_stats->p_ru.ru_inblock
 1936                                                             += cb->inputcharge;
 1937                                                         cb->inputcharge = 0;
 1938                                                 }
 1939                                                 found++;
 1940                                                 break;
 1941                                         }
 1942                                 }
 1943 
 1944                                 s = splbio();
 1945                                 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
 1946                                         if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
 1947                                             == jobref) {
 1948                                                 found++;
 1949                                                 break;
 1950                                         }
 1951                                 }
 1952                                 splx(s);
 1953                         }
 1954 
 1955                         /*
 1956                          * If all I/Os have been disposed of, then we can
 1957                          * return.
 1958                          */
 1959                         if (found == nentqueued)
 1960                                 return runningcode;
 1961                         
 1962                         ki->kaio_flags |= KAIO_WAKEUP;
 1963                         error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
 1964 
 1965                         if (error == EINTR)
 1966                                 return EINTR;
 1967                         else if (error == EWOULDBLOCK)
 1968                                 return EAGAIN;
 1969                 }
 1970         }
 1971 
 1972         return runningcode;
 1973 #endif /* VFS_AIO */
 1974 }
 1975 
 1976 #ifdef VFS_AIO
 1977 /*
 1978  * This is a weird hack so that we can post a signal.  It is safe to do so from
 1979  * a timeout routine, but *not* from an interrupt routine.
 1980  */
 1981 static void
 1982 process_signal(void *aioj)
 1983 {
 1984         struct aiocblist *aiocbe = aioj;
 1985         struct aio_liojob *lj = aiocbe->lio;
 1986         struct aiocb *cb = &aiocbe->uaiocb;
 1987 
 1988         if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
 1989             (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
 1990                 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
 1991                 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 1992         }
 1993 
 1994         if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
 1995                 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
 1996 }
 1997 
 1998 /*
 1999  * Interrupt handler for physio, performs the necessary process wakeups, and
 2000  * signals.
 2001  */
 2002 static void
 2003 aio_physwakeup(struct buf *bp)
 2004 {
 2005         struct aiocblist *aiocbe;
 2006         struct proc *p;
 2007         struct kaioinfo *ki;
 2008         struct aio_liojob *lj;
 2009 
 2010         wakeup(bp);
 2011 
 2012         aiocbe = (struct aiocblist *)bp->b_spc;
 2013         if (aiocbe) {
 2014                 p = bp->b_caller1;
 2015 
 2016                 aiocbe->jobstate = JOBST_JOBBFINISHED;
 2017                 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
 2018                 aiocbe->uaiocb._aiocb_private.error = 0;
 2019                 aiocbe->jobflags |= AIOCBLIST_DONE;
 2020 
 2021                 if (bp->b_flags & B_ERROR)
 2022                         aiocbe->uaiocb._aiocb_private.error = bp->b_error;
 2023 
 2024                 lj = aiocbe->lio;
 2025                 if (lj) {
 2026                         lj->lioj_buffer_finished_count++;
 2027                         
 2028                         /*
 2029                          * wakeup/signal if all of the interrupt jobs are done.
 2030                          */
 2031                         if (lj->lioj_buffer_finished_count ==
 2032                             lj->lioj_buffer_count) {
 2033                                 /*
 2034                                  * Post a signal if it is called for.
 2035                                  */
 2036                                 if ((lj->lioj_flags &
 2037                                     (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
 2038                                     LIOJ_SIGNAL) {
 2039                                         lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 2040                                         aiocbe->timeouthandle =
 2041                                                 timeout(process_signal,
 2042                                                         aiocbe, 0);
 2043                                 }
 2044                         }
 2045                 }
 2046 
 2047                 ki = p->p_aioinfo;
 2048                 if (ki) {
 2049                         ki->kaio_buffer_finished_count++;
 2050                         TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
 2051                         TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
 2052                         TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
 2053 
 2054                         KNOTE(&aiocbe->klist, 0);
 2055                         /* Do the wakeup. */
 2056                         if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
 2057                                 ki->kaio_flags &= ~KAIO_WAKEUP;
 2058                                 wakeup(p);
 2059                         }
 2060                 }
 2061 
 2062                 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
 2063                         aiocbe->timeouthandle =
 2064                                 timeout(process_signal, aiocbe, 0);
 2065         }
 2066 }
 2067 #endif /* VFS_AIO */
 2068 
 2069 /* syscall - wait for the next completion of an aio request */
 2070 int
 2071 aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
 2072 {
 2073 #ifndef VFS_AIO
 2074         return ENOSYS;
 2075 #else
 2076         struct timeval atv;
 2077         struct timespec ts;
 2078         struct kaioinfo *ki;
 2079         struct aiocblist *cb = NULL;
 2080         int error, s, timo;
 2081         
 2082         suword(uap->aiocbp, (int)NULL);
 2083 
 2084         timo = 0;
 2085         if (uap->timeout) {
 2086                 /* Get timespec struct. */
 2087                 error = copyin(uap->timeout, &ts, sizeof(ts));
 2088                 if (error)
 2089                         return error;
 2090 
 2091                 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
 2092                         return (EINVAL);
 2093 
 2094                 TIMESPEC_TO_TIMEVAL(&atv, &ts);
 2095                 if (itimerfix(&atv))
 2096                         return (EINVAL);
 2097                 timo = tvtohz(&atv);
 2098         }
 2099 
 2100         ki = p->p_aioinfo;
 2101         if (ki == NULL)
 2102                 return EAGAIN;
 2103 
 2104         for (;;) {
 2105                 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
 2106                         suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
 2107                         p->p_retval[0] = cb->uaiocb._aiocb_private.status;
 2108                         if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 2109                                 p->p_stats->p_ru.ru_oublock +=
 2110                                     cb->outputcharge;
 2111                                 cb->outputcharge = 0;
 2112                         } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 2113                                 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
 2114                                 cb->inputcharge = 0;
 2115                         }
 2116                         aio_free_entry(cb);
 2117                         return cb->uaiocb._aiocb_private.error;
 2118                 }
 2119 
 2120                 s = splbio();
 2121                 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
 2122                         splx(s);
 2123                         suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
 2124                         p->p_retval[0] = cb->uaiocb._aiocb_private.status;
 2125                         aio_free_entry(cb);
 2126                         return cb->uaiocb._aiocb_private.error;
 2127                 }
 2128 
 2129                 ki->kaio_flags |= KAIO_WAKEUP;
 2130                 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
 2131                 splx(s);
 2132 
 2133                 if (error == ERESTART)
 2134                         return EINTR;
 2135                 else if (error < 0)
 2136                         return error;
 2137                 else if (error == EINTR)
 2138                         return EINTR;
 2139                 else if (error == EWOULDBLOCK)
 2140                         return EAGAIN;
 2141         }
 2142 #endif /* VFS_AIO */
 2143 }
 2144 
 2145 #ifndef VFS_AIO
 2146 static int
 2147 filt_aioattach(struct knote *kn)
 2148 {
 2149 
 2150         return (ENXIO);
 2151 }
 2152 
 2153 struct filterops aio_filtops =
 2154         { 0, filt_aioattach, NULL, NULL };
 2155 
 2156 #else
 2157 /* kqueue attach function */
 2158 static int
 2159 filt_aioattach(struct knote *kn)
 2160 {
 2161         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 2162 
 2163         /*
 2164          * The aiocbe pointer must be validated before using it, so
 2165          * registration is restricted to the kernel; the user cannot
 2166          * set EV_FLAG1.
 2167          */
 2168         if ((kn->kn_flags & EV_FLAG1) == 0)
 2169                 return (EPERM);
 2170         kn->kn_flags &= ~EV_FLAG1;
 2171 
 2172         SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
 2173 
 2174         return (0);
 2175 }
 2176 
 2177 /* kqueue detach function */
 2178 static void
 2179 filt_aiodetach(struct knote *kn)
 2180 {
 2181         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 2182 
 2183         SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
 2184 }
 2185 
 2186 /* kqueue filter function */
 2187 /*ARGSUSED*/
 2188 static int
 2189 filt_aio(struct knote *kn, long hint)
 2190 {
 2191         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 2192 
 2193         kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
 2194         if (aiocbe->jobstate != JOBST_JOBFINISHED &&
 2195             aiocbe->jobstate != JOBST_JOBBFINISHED)
 2196                 return (0);
 2197         kn->kn_flags |= EV_EOF; 
 2198         return (1);
 2199 }
 2200 
 2201 struct filterops aio_filtops =
 2202         { 0, filt_aioattach, filt_aiodetach, filt_aio };
 2203 #endif /* VFS_AIO */

Cache object: 6b3b1bf6dbb349e9d2d7d2cb67da09dc


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.