
FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_aio.c


    1 /*
    2  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions
    6  * are met:
    7  * 1. Redistributions of source code must retain the above copyright
    8  *    notice, this list of conditions and the following disclaimer.
    9  * 2. John S. Dyson's name may not be used to endorse or promote products
   10  *    derived from this software without specific prior written permission.
   11  *
   12  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
   13  * bad that happens because of using this software isn't the responsibility
   14  * of the author.  This software is distributed AS-IS.
   15  *
   16  * $FreeBSD$
   17  */
   18 
   19 /*
   20  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
   21  */
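
/*
 * For orientation, a minimal sketch of how userland drives this facility
 * through the POSIX AIO API (illustrative only; the open descriptor "fd"
 * and all error handling are assumed):
 *
 *	struct aiocb acb;
 *	char buf[512];
 *	ssize_t len;
 *
 *	bzero(&acb, sizeof(acb));
 *	acb.aio_fildes = fd;
 *	acb.aio_buf = buf;
 *	acb.aio_nbytes = sizeof(buf);
 *	acb.aio_offset = 0;
 *	aio_read(&acb);				enters aio_aqueue() below
 *	while (aio_error(&acb) == EINPROGRESS)
 *		;				or block in aio_suspend()
 *	len = aio_return(&acb);			reaps status, frees the job
 */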
   22 
   23 #include <sys/param.h>
   24 #include <sys/systm.h>
   25 #include <sys/sysproto.h>
   26 #include <sys/filedesc.h>
   27 #include <sys/kernel.h>
   28 #include <sys/fcntl.h>
   29 #include <sys/file.h>
   30 #include <sys/lock.h>
   31 #include <sys/unistd.h>
   32 #include <sys/proc.h>
   33 #include <sys/resourcevar.h>
   34 #include <sys/signalvar.h>
   35 #include <sys/sysctl.h>
   36 #include <sys/vnode.h>
   37 #include <sys/conf.h>
   38 #include <miscfs/specfs/specdev.h>
   39 
   40 #include <vm/vm.h>
   41 #include <vm/vm_param.h>
   42 #include <vm/vm_extern.h>
   43 #include <vm/pmap.h>
   44 #include <vm/vm_map.h>
   45 #include <vm/vm_zone.h>
   46 #include <sys/aio.h>
   47 #include <sys/shm.h>
   48 
   49 #include <machine/cpu.h>
   50 #include <machine/limits.h>
   51 
   52 static  long jobrefid;
   53 
    54 #define JOBST_NULL              0x0
    55 #define JOBST_JOBQPROC          0x1
    56 #define JOBST_JOBQGLOBAL        0x2
    57 #define JOBST_JOBRUNNING        0x3
    58 #define JOBST_JOBFINISHED       0x4
    59 #define JOBST_JOBQBUF           0x5
    60 #define JOBST_JOBBFINISHED      0x6
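
/*
 * Rough job state progression, as wired up below: JOBST_NULL ->
 * JOBST_JOBQGLOBAL (or JOBST_JOBQPROC, when handed to a specific daemon)
 * -> JOBST_JOBRUNNING -> JOBST_JOBFINISHED for daemon-serviced requests;
 * JOBST_NULL -> JOBST_JOBQBUF -> JOBST_JOBBFINISHED for the direct
 * physio path.
 */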
   61 
   62 #ifndef MAX_AIO_PER_PROC
   63 #define MAX_AIO_PER_PROC        32
   64 #endif
   65 
   66 #ifndef MAX_AIO_QUEUE_PER_PROC
   67 #define MAX_AIO_QUEUE_PER_PROC  256 /* Bigger than AIO_LISTIO_MAX */
   68 #endif
   69 
   70 #ifndef MAX_AIO_PROCS
   71 #define MAX_AIO_PROCS           32
   72 #endif
   73 
   74 #ifndef MAX_AIO_QUEUE
   75 #define MAX_AIO_QUEUE           1024 /* Bigger than AIO_LISTIO_MAX */
   76 #endif
   77 
   78 #ifndef TARGET_AIO_PROCS
   79 #define TARGET_AIO_PROCS        0
   80 #endif
   81 
   82 #ifndef MAX_BUF_AIO
   83 #define MAX_BUF_AIO 16
   84 #endif
   85 
   86 #ifndef AIOD_TIMEOUT_DEFAULT
   87 #define AIOD_TIMEOUT_DEFAULT (10 * hz)
   88 #endif
   89 
   90 #ifndef AIOD_LIFETIME_DEFAULT
   91 #define AIOD_LIFETIME_DEFAULT (30 * hz)
   92 #endif
   93 
   94 static int max_aio_procs = MAX_AIO_PROCS;
   95 static int num_aio_procs = 0;
   96 static int target_aio_procs = TARGET_AIO_PROCS;
   97 static int max_queue_count = MAX_AIO_QUEUE;
   98 static int num_queue_count = 0;
   99 static int num_buf_aio = 0;
  100 static int num_aio_resv_start = 0;
  101 static int aiod_timeout;
  102 static int aiod_lifetime;
  103 
   104 static int max_aio_per_proc = MAX_AIO_PER_PROC,
   105         max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
  106 
  107 static int max_buf_aio = MAX_BUF_AIO;
  108 
  109 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
  110 
  111 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
  112         CTLFLAG_RW, &max_aio_per_proc, 0, "");
  113 
  114 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
  115         CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
  116 
  117 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
  118         CTLFLAG_RW, &max_aio_procs, 0, "");
  119 
  120 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
  121         CTLFLAG_RD, &num_aio_procs, 0, "");
  122 
  123 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
  124         CTLFLAG_RD, &num_queue_count, 0, "");
  125 
  126 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
  127         CTLFLAG_RW, &max_queue_count, 0, "");
  128 
  129 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
  130         CTLFLAG_RW, &target_aio_procs, 0, "");
  131 
  132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
  133         CTLFLAG_RW, &max_buf_aio, 0, "");
  134 
  135 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
  136         CTLFLAG_RD, &num_buf_aio, 0, "");
  137 
  138 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
  139         CTLFLAG_RW, &aiod_lifetime, 0, "");
  140 
  141 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
  142         CTLFLAG_RW, &aiod_timeout, 0, "");
  143 
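/*
 * The knobs above are exported as vfs.aio.* sysctls; for example
 * (illustrative):
 *
 *	sysctl -w vfs.aio.max_aio_per_proc=64	raise the per-process limit
 *	sysctl vfs.aio.num_queue_count		read-only activity counter
 */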
  144 
  145 /*
  146  * Job queue item
  147  */
  148 
  149 #define AIOCBLIST_CANCELLED     0x1
  150 #define AIOCBLIST_RUNDOWN       0x4
  151 #define AIOCBLIST_ASYNCFREE     0x8
  152 #define AIOCBLIST_DONE          0x10
  153 
  154 struct aiocblist {
  155         TAILQ_ENTRY (aiocblist) list;           /* List of jobs */
  156         TAILQ_ENTRY (aiocblist) plist;          /* List of jobs for proc */
  157         int     jobflags;
  158         int     jobstate;
  159         int inputcharge, outputcharge;
  160         struct  buf *bp;                                /* buffer pointer */
  161         struct  proc *userproc;                 /* User process */
  162         struct  aioproclist     *jobaioproc;    /* AIO process descriptor */
  163         struct  aio_liojob      *lio;           /* optional lio job */
  164         struct  aiocb *uuaiocb;                 /* pointer in userspace of aiocb */
  165         struct  aiocb uaiocb;                   /* Kernel I/O control block */
  166 };
  167 
  168 
  169 /*
  170  * AIO process info
  171  */
  172 #define AIOP_FREE       0x1                     /* proc on free queue */
  173 #define AIOP_SCHED      0x2                     /* proc explicitly scheduled */
  174 
  175 struct aioproclist {
  176         int aioprocflags;                       /* AIO proc flags */
  177         TAILQ_ENTRY(aioproclist) list;          /* List of processes */
  178         struct proc *aioproc;                   /* The AIO thread */
  179         TAILQ_HEAD (,aiocblist) jobtorun;       /* suggested job to run */
  180 };
  181 
  182 /*
  183  * data-structure for lio signal management
  184  */
  185 struct aio_liojob {
  186         int lioj_flags;
  187         int     lioj_buffer_count;
  188         int     lioj_buffer_finished_count;
  189         int     lioj_queue_count;
  190         int     lioj_queue_finished_count;
  191         struct sigevent lioj_signal;    /* signal on all I/O done */
  192         TAILQ_ENTRY (aio_liojob) lioj_list;
  193         struct kaioinfo *lioj_ki;
  194 };
  195 #define LIOJ_SIGNAL                     0x1 /* signal on all done (lio) */
  196 #define LIOJ_SIGNAL_POSTED      0x2     /* signal has been posted */
  197 
  198 /*
  199  * per process aio data structure
  200  */
  201 struct kaioinfo {
  202         int     kaio_flags;                     /* per process kaio flags */
  203         int     kaio_maxactive_count;   /* maximum number of AIOs */
  204         int     kaio_active_count;      /* number of currently used AIOs */
   205         int     kaio_qallowed_count;    /* maximum size of AIO queue */
  206         int     kaio_queue_count;       /* size of AIO queue */
  207         int     kaio_ballowed_count;    /* maximum number of buffers */
  208         int     kaio_queue_finished_count;      /* number of daemon jobs finished */
  209         int     kaio_buffer_count;      /* number of physio buffers */
  210         int     kaio_buffer_finished_count;     /* count of I/O done */
  211         struct proc *kaio_p;                    /* process that uses this kaio block */
  212         TAILQ_HEAD (,aio_liojob) kaio_liojoblist;       /* list of lio jobs */
  213         TAILQ_HEAD (,aiocblist) kaio_jobqueue;  /* job queue for process */
  214         TAILQ_HEAD (,aiocblist) kaio_jobdone;   /* done queue for process */
  215         TAILQ_HEAD (,aiocblist) kaio_bufqueue;  /* buffer job queue for process */
  216         TAILQ_HEAD (,aiocblist) kaio_bufdone;   /* buffer done queue for process */
  217 };
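
/*
 * A kaioinfo block is allocated lazily by aio_init_aioinfo() on a
 * process's first AIO request and torn down by aio_proc_rundown() when
 * the process exits.
 */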
  218 
  219 #define KAIO_RUNDOWN 0x1                /* process is being run down */
   220 #define KAIO_WAKEUP 0x2                 /* wakeup process when there is
   221                                            a significant event */
  222 
  223 
  224 static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
  225 static TAILQ_HEAD(,aiocblist) aio_jobs;                 /* Async job list */
  226 static TAILQ_HEAD(,aiocblist) aio_bufjobs;              /* Phys I/O job list */
  227 static TAILQ_HEAD(,aiocblist) aio_freejobs;             /* Pool of free jobs */
  228 
   229 static void aio_init_aioinfo(struct proc *p);
   230 static void aio_onceonly(void *);
   231 static int aio_free_entry(struct aiocblist *aiocbe);
   232 static void aio_process(struct aiocblist *aiocbe);
   233 static int aio_newproc(void);
   234 static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
  235 static void aio_physwakeup(struct buf *bp);
  236 static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
  237 static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
  238 static void aio_daemon(void *uproc);
  239 
  240 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
  241 
   242 static vm_zone_t kaio_zone = 0, aiop_zone = 0,
   243         aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;
  244 
  245 /*
  246  * Single AIOD vmspace shared amongst all of them
  247  */
  248 struct vmspace *aiovmspace = NULL;
  249 
  250 /*
  251  * Startup initialization
  252  */
  253 void
  254 aio_onceonly(void *na)
  255 {
  256         TAILQ_INIT(&aio_freeproc);
  257         TAILQ_INIT(&aio_activeproc);
  258         TAILQ_INIT(&aio_jobs);
  259         TAILQ_INIT(&aio_bufjobs);
  260         TAILQ_INIT(&aio_freejobs);
  261         kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
  262         aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
  263         aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
  264         aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
  265         aiolio_zone = zinit("AIOLIO",
  266                 AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
  267         aiod_timeout = AIOD_TIMEOUT_DEFAULT;
  268         aiod_lifetime = AIOD_LIFETIME_DEFAULT;
  269         jobrefid = 1;
  270 }
  271 
  272 /*
  273  * Init the per-process aioinfo structure.
  274  * The aioinfo limits are set per-process for user limit (resource) management.
  275  */
  276 void
  277 aio_init_aioinfo(struct proc *p)
  278 {
  279         struct kaioinfo *ki;
  280         if (p->p_aioinfo == NULL) {
  281                 ki = zalloc(kaio_zone);
  282                 p->p_aioinfo = ki;
  283                 ki->kaio_flags = 0;
  284                 ki->kaio_maxactive_count = max_aio_per_proc;
  285                 ki->kaio_active_count = 0;
  286                 ki->kaio_qallowed_count = max_aio_queue_per_proc;
  287                 ki->kaio_queue_count = 0;
  288                 ki->kaio_ballowed_count = max_buf_aio;
  289                 ki->kaio_buffer_count = 0;
  290                 ki->kaio_buffer_finished_count = 0;
  291                 ki->kaio_p = p;
  292                 TAILQ_INIT(&ki->kaio_jobdone);
  293                 TAILQ_INIT(&ki->kaio_jobqueue);
  294                 TAILQ_INIT(&ki->kaio_bufdone);
  295                 TAILQ_INIT(&ki->kaio_bufqueue);
  296                 TAILQ_INIT(&ki->kaio_liojoblist);
  297         }
  298 }
  299 
  300 /*
  301  * Free a job entry.  Wait for completion if it is currently
  302  * active, but don't delay forever.  If we delay, we return
  303  * a flag that says that we have to restart the queue scan.
  304  */
  305 int
  306 aio_free_entry(struct aiocblist *aiocbe)
  307 {
  308         struct kaioinfo *ki;
  309         struct aioproclist *aiop;
  310         struct aio_liojob *lj;
  311         struct proc *p;
  312         int error;
  313         int s;
  314 
  315         if (aiocbe->jobstate == JOBST_NULL)
  316                 panic("aio_free_entry: freeing already free job");
  317 
  318         p = aiocbe->userproc;
  319         ki = p->p_aioinfo;
  320         lj = aiocbe->lio;
  321         if (ki == NULL)
  322                 panic("aio_free_entry: missing p->p_aioinfo");
  323 
  324         if (aiocbe->jobstate == JOBST_JOBRUNNING) {
  325                 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
  326                         return 0;
  327                 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
  328                 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
  329         }
  330         aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
  331 
  332         if (aiocbe->bp == NULL) {
  333                 if (ki->kaio_queue_count <= 0)
  334                         panic("aio_free_entry: process queue size <= 0");
  335                 if (num_queue_count <= 0)
  336                         panic("aio_free_entry: system wide queue size <= 0");
  337         
   338                 if (lj) {
  339                         lj->lioj_queue_count--;
  340                         if (aiocbe->jobflags & AIOCBLIST_DONE)
  341                                 lj->lioj_queue_finished_count--;
  342                 }
  343                 ki->kaio_queue_count--;
  344                 if (aiocbe->jobflags & AIOCBLIST_DONE)
  345                         ki->kaio_queue_finished_count--;
  346                 num_queue_count--;
  347 
  348         } else {
   349                 if (lj) {
  350                         lj->lioj_buffer_count--;
  351                         if (aiocbe->jobflags & AIOCBLIST_DONE)
  352                                 lj->lioj_buffer_finished_count--;
  353                 }
  354                 if (aiocbe->jobflags & AIOCBLIST_DONE)
  355                         ki->kaio_buffer_finished_count--;
  356                 ki->kaio_buffer_count--;
  357                 num_buf_aio--;
  358 
  359         }
  360 
   361         if ((ki->kaio_flags & KAIO_WAKEUP) ||
   362                 ((ki->kaio_flags & KAIO_RUNDOWN) &&
   363                 (ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) {
  364                 ki->kaio_flags &= ~KAIO_WAKEUP;
  365                 wakeup(p);
  366         }
  367 
   368         if (aiocbe->jobstate == JOBST_JOBQBUF) {
   369                 if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
   370                         return error;
   371                 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
   372                         panic("aio_free_entry: invalid physio finish-up state");
   373                 s = splbio();
   374                 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
   375                 splx(s);
   376         } else if (aiocbe->jobstate == JOBST_JOBQPROC) {
   377                 aiop = aiocbe->jobaioproc;
   378                 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
   379         } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
   380                 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
   381         } else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
   382                 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
   383         } else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
  384                 s = splbio();
  385                 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
  386                 splx(s);
  387                 if (aiocbe->bp) {
  388                         vunmapbuf(aiocbe->bp);
  389                         relpbuf(aiocbe->bp);
  390                         aiocbe->bp = NULL;
  391                 }
  392         }
  393         if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
  394                 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
  395                 zfree(aiolio_zone, lj);
  396         }
  397         TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
  398         aiocbe->jobstate = JOBST_NULL;
  399         return 0;
  400 }
  401 
   402 /*
   403  * Run down the jobs for a given process (called when the process exits).
   404  */
  405 void
  406 aio_proc_rundown(struct proc *p)
  407 {
  408         int s;
  409         struct kaioinfo *ki;
  410         struct aio_liojob *lj, *ljn;
  411         struct aiocblist *aiocbe, *aiocbn;
  412         
  413         ki = p->p_aioinfo;
  414         if (ki == NULL)
  415                 return;
  416 
  417         ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
  418         while ((ki->kaio_active_count > 0) ||
  419                 (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
  420                 ki->kaio_flags |= KAIO_RUNDOWN;
  421                 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
  422                         break;
  423         }
  424 
  425 restart1:
   426         for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
  427                 aiocbe;
  428                 aiocbe = aiocbn) {
  429                 aiocbn = TAILQ_NEXT(aiocbe, plist);
  430                 if (aio_free_entry(aiocbe))
  431                         goto restart1;
  432         }
  433 
  434 restart2:
   435         for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
  436                 aiocbe;
  437                 aiocbe = aiocbn) {
  438                 aiocbn = TAILQ_NEXT(aiocbe, plist);
  439                 if (aio_free_entry(aiocbe))
  440                         goto restart2;
  441         }
  442 
  443 /*
  444  * Note the use of lots of splbio here, trying to avoid
  445  * splbio for long chains of I/O.  Probably unnecessary.
  446  */
  447 
  448 restart3:
  449         s = splbio();
  450         while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
  451                 ki->kaio_flags |= KAIO_WAKEUP;
   452                 tsleep(p, PRIBIO, "aioprn", 0);
  453                 splx(s);
  454                 goto restart3;
  455         }
  456         splx(s);
  457 
  458 restart4:
  459         s = splbio();
   460         for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
  461                 aiocbe;
  462                 aiocbe = aiocbn) {
  463                 aiocbn = TAILQ_NEXT(aiocbe, plist);
  464                 if (aio_free_entry(aiocbe)) {
  465                         splx(s);
  466                         goto restart4;
  467                 }
  468         }
  469         splx(s);
  470 
   471         for (lj = TAILQ_FIRST(&ki->kaio_liojoblist);
   472                 lj;
   473                 lj = ljn) {
   474                 ljn = TAILQ_NEXT(lj, lioj_list);
   475                 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
   476                         TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
   477                         zfree(aiolio_zone, lj);
   478                 } else {
   479 #if defined(DIAGNOSTIC)
   480                         printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
   481                                 lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
   482                                 lj->lioj_queue_count, lj->lioj_queue_finished_count);
   483 #endif
   484                 }
   485         }
  486 
  487         zfree(kaio_zone, ki);
  488         p->p_aioinfo = NULL;
  489 }
  490 
  491 /*
  492  * Select a job to run (called by an AIO daemon)
  493  */
  494 static struct aiocblist *
  495 aio_selectjob(struct aioproclist *aiop)
  496 {
  497 
  498         struct aiocblist *aiocbe;
  499 
  500         aiocbe = TAILQ_FIRST(&aiop->jobtorun);
  501         if (aiocbe) {
  502                 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
  503                 return aiocbe;
  504         }
  505 
  506         for (aiocbe = TAILQ_FIRST(&aio_jobs);
  507                 aiocbe;
  508                 aiocbe = TAILQ_NEXT(aiocbe, list)) {
  509                 struct kaioinfo *ki;
  510                 struct proc *userp;
  511 
  512                 userp = aiocbe->userproc;
  513                 ki = userp->p_aioinfo;
  514 
  515                 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
  516                         TAILQ_REMOVE(&aio_jobs, aiocbe, list);
  517                         return aiocbe;
  518                 }
  519         }
  520 
  521         return NULL;
  522 }
  523 
  524 /*
  525  * The AIO processing activity.  This is the code that does the
  526  * I/O request for the non-physio version of the operations.  The
  527  * normal vn operations are used, and this code should work in
  528  * all instances for every type of file, including pipes, sockets,
  529  * fifos, and regular files.
  530  */
  531 void
  532 aio_process(struct aiocblist *aiocbe)
  533 {
  534         struct filedesc *fdp;
  535         struct proc *userp, *mycp;
  536         struct aiocb *cb;
  537         struct file *fp;
  538         struct uio auio;
  539         struct iovec aiov;
  540         unsigned int fd;
  541         int cnt;
  542         int error;
  543         off_t offset;
  544         int oublock_st, oublock_end;
  545         int inblock_st, inblock_end;
  546 
  547         userp = aiocbe->userproc;
  548         cb = &aiocbe->uaiocb;
  549 
  550         mycp = curproc;
  551 
  552         fdp = mycp->p_fd;
  553         fd = cb->aio_fildes;
  554         fp = fdp->fd_ofiles[fd];
  555 
  556         aiov.iov_base = (void *) cb->aio_buf;
  557         aiov.iov_len = cb->aio_nbytes;
  558 
  559         auio.uio_iov = &aiov;
  560         auio.uio_iovcnt = 1;
  561         auio.uio_offset = offset = cb->aio_offset;
  562         auio.uio_resid = cb->aio_nbytes;
  563         cnt = cb->aio_nbytes;
  564         auio.uio_segflg = UIO_USERSPACE;
  565         auio.uio_procp = mycp;
  566 
  567         inblock_st = mycp->p_stats->p_ru.ru_inblock;
  568         oublock_st = mycp->p_stats->p_ru.ru_oublock;
  569         if (cb->aio_lio_opcode == LIO_READ) {
  570                 auio.uio_rw = UIO_READ;
  571                 error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET);
  572         } else {
  573                 auio.uio_rw = UIO_WRITE;
  574                 error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET);
  575         }
  576         inblock_end = mycp->p_stats->p_ru.ru_inblock;
  577         oublock_end = mycp->p_stats->p_ru.ru_oublock;
  578 
  579         aiocbe->inputcharge = inblock_end - inblock_st;
  580         aiocbe->outputcharge = oublock_end - oublock_st;
  581 
  582         if (error) {
  583                 if (auio.uio_resid != cnt) {
  584                         if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
  585                                 error = 0;
  586                         if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
  587                                 psignal(userp, SIGPIPE);
  588                 }
  589         }
  590 
  591         cnt -= auio.uio_resid;
  592         cb->_aiocb_private.error = error;
  593         cb->_aiocb_private.status = cnt;
  594         
  595         return;
  596 
  597 }
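
/*
 * Note that aio_process() reports completion through the job's
 * _aiocb_private error and status fields; userland retrieves these via
 * aio_error() and aio_return() rather than through a direct return value.
 */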
  598 
   599 /*
   600  * The AIO daemon.  Most of the actual work is done in aio_process;
   601  * the setup (and address space management) is done in this routine.
   602  */
  603 static void
  604 aio_daemon(void *uproc)
  605 {
  606         int s;
  607         struct aioproclist *aiop;
  608         struct vmspace *myvm, *aiovm;
  609         struct proc *mycp;
  610 
   611         /*
   612          * Local copies of curproc (mycp) and vmspace (myvm)
   613          */
  614         mycp = curproc;
  615         myvm = mycp->p_vmspace;
  616 
  617         /*
  618          * We manage to create only one VM space for all AIOD processes.
  619          * The VM space for the first AIOD created becomes the shared VM
  620          * space for all of them.  We add an additional reference count,
  621          * even for the first AIOD, so the address space does not go away,
  622          * and we continue to use that original VM space even if the first
  623          * AIOD exits.
  624          */
  625         if ((aiovm = aiovmspace) == NULL) {
  626                 aiovmspace = myvm;
  627                 myvm->vm_refcnt++;
  628                 /*
  629                  * Remove userland cruft from address space.
  630                  */
  631                 if (myvm->vm_shm)
  632                         shmexit(mycp);
  633                 pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
  634                 vm_map_remove(&myvm->vm_map, 0, USRSTACK);
  635                 myvm->vm_tsize = 0;
  636                 myvm->vm_dsize = 0;
  637                 myvm->vm_ssize = 0;
  638         } else {
  639                 aiovm->vm_refcnt++;
  640                 mycp->p_vmspace = aiovm;
  641                 pmap_activate(mycp);
  642                 vmspace_free(myvm);
  643                 myvm = aiovm;
  644         }
  645 
  646         if (mycp->p_textvp) {
  647                 vrele(mycp->p_textvp);
  648                 mycp->p_textvp = NULL;
  649         }
  650 
  651         /*
  652          * Allocate and ready the aio control info.  There is one
  653          * aiop structure per daemon.
  654          */
  655         aiop = zalloc(aiop_zone);
  656         aiop->aioproc = mycp;
  657         aiop->aioprocflags |= AIOP_FREE;
  658         TAILQ_INIT(&aiop->jobtorun);
  659 
  660         /*
  661          * Place thread (lightweight process) onto the AIO free thread list
  662          */
  663         if (TAILQ_EMPTY(&aio_freeproc))
  664                 wakeup(&aio_freeproc);
  665         TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
  666 
  667         /*
  668          * Make up a name for the daemon
  669          */
  670         strcpy(mycp->p_comm, "aiod");
  671 
   672         /*
   673          * Get rid of our current file descriptors.  AIODs don't need any
   674          * file descriptors, except as temporarily inherited from the client.
   675          * Credentials are also cloned, and made equivalent to "root."
   676          */
  677         fdfree(mycp);
  678         mycp->p_fd = NULL;
  679         mycp->p_ucred = crcopy(mycp->p_ucred);
  680         mycp->p_ucred->cr_uid = 0;
  681         mycp->p_ucred->cr_ngroups = 1;
  682         mycp->p_ucred->cr_groups[0] = 1;
  683 
  684         /*
  685          * The daemon resides in its own pgrp.
  686          */
  687         enterpgrp(mycp, mycp->p_pid, 1);
  688 
  689         /*
  690          * Mark special process type
  691          */
  692         mycp->p_flag |= P_SYSTEM|P_KTHREADP;
  693 
   694         /*
   695          * Wakeup parent process.  (Parent sleeps to keep from blasting away
   696          * and creating too many daemons.)
   697          */
  698         wakeup(mycp);
  699 
   700         while (1) {
  701                 struct proc *curcp;
  702                 struct  aiocblist *aiocbe;
  703 
  704                 /*
  705                  * curcp is the current daemon process context.
  706                  * userp is the current user process context.
  707                  */
  708                 curcp = mycp;
  709 
  710                 /*
  711                  * Take daemon off of free queue
  712                  */
  713                 if (aiop->aioprocflags & AIOP_FREE) {
  714                         TAILQ_REMOVE(&aio_freeproc, aiop, list);
  715                         TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
  716                         aiop->aioprocflags &= ~AIOP_FREE;
  717                 }
  718                 aiop->aioprocflags &= ~AIOP_SCHED;
  719 
  720                 /*
  721                  * Check for jobs
  722                  */
   723                 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
  724                         struct proc *userp;
  725                         struct aiocb *cb;
  726                         struct kaioinfo *ki;
  727                         struct aio_liojob *lj;
  728 
  729                         cb = &aiocbe->uaiocb;
  730                         userp = aiocbe->userproc;
  731 
  732                         aiocbe->jobstate = JOBST_JOBRUNNING;
  733 
  734                         /*
  735                          * Connect to process address space for user program
  736                          */
  737                         if (userp != curcp) {
  738                                 struct vmspace *tmpvm;
  739                                 /*
  740                                  * Save the current address space that we are connected to.
  741                                  */
  742                                 tmpvm = mycp->p_vmspace;
  743                                 /*
  744                                  * Point to the new user address space, and refer to it.
  745                                  */
  746                                 mycp->p_vmspace = userp->p_vmspace;
  747                                 mycp->p_vmspace->vm_refcnt++;
  748                                 /*
  749                                  * Activate the new mapping.
  750                                  */
  751                                 pmap_activate(mycp);
  752                                 /*
  753                                  * If the old address space wasn't the daemons own address
  754                                  * space, then we need to remove the daemon's reference from
  755                                  * the other process that it was acting on behalf of.
  756                                  */
  757                                 if (tmpvm != myvm) {
  758                                         vmspace_free(tmpvm);
  759                                 }
  760                                 /*
  761                                  * Disassociate from previous clients file descriptors, and
  762                                  * associate to the new clients descriptors.  Note that
   763                          * the daemon doesn't need to worry about its original
  764                                  * descriptors, because they were originally freed.
  765                                  */
  766                                 if (mycp->p_fd)
  767                                         fdfree(mycp);
  768                                 mycp->p_fd = fdshare(userp);
  769                                 curcp = userp;
  770                         }
  771 
  772                         ki = userp->p_aioinfo;
  773                         lj = aiocbe->lio;
  774 
  775                         /*
  776                          * Account for currently active jobs
  777                          */
  778                         ki->kaio_active_count++;
  779 
  780                         /*
  781                          * Do the I/O function
  782                          */
  783                         aiocbe->jobaioproc = aiop;
  784                         aio_process(aiocbe);
  785 
  786                         /*
  787                          * decrement the active job count
  788                          */
  789                         ki->kaio_active_count--;
  790 
  791                         /*
  792                          * increment the completion count for wakeup/signal comparisons
  793                          */
  794                         aiocbe->jobflags |= AIOCBLIST_DONE;
  795                         ki->kaio_queue_finished_count++;
  796                         if (lj) {
  797                                 lj->lioj_queue_finished_count++;
  798                         }
   799                         if ((ki->kaio_flags & KAIO_WAKEUP) ||
   800                                 ((ki->kaio_flags & KAIO_RUNDOWN) &&
   801                                 (ki->kaio_active_count == 0))) {
  802                                 ki->kaio_flags &= ~KAIO_WAKEUP;
  803                                 wakeup(userp);
  804                         }
  805 
  806                         s = splbio();
  807                         if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
  808                                 LIOJ_SIGNAL) {
  809                                 if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
  810                                         (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
  811                                                 psignal(userp, lj->lioj_signal.sigev_signo);
  812                                                 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
  813                                 }
  814                         }
  815                         splx(s);
  816 
  817                         aiocbe->jobstate = JOBST_JOBFINISHED;
  818 
  819                         /*
  820                          * If the I/O request should be automatically rundown, do the
  821                          * needed cleanup.  Otherwise, place the queue entry for
  822                          * the just finished I/O request into the done queue for the
  823                          * associated client.
  824                          */
  825                         if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
  826                                 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
  827                                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
  828                         } else {
  829                                 TAILQ_REMOVE(&ki->kaio_jobqueue,
  830                                         aiocbe, plist);
  831                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
  832                                         aiocbe, plist);
  833                         }
  834 
  835                         if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
  836                                 wakeup(aiocbe);
  837                                 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
  838                         }
  839 
  840                         if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
  841                                 psignal(userp, cb->aio_sigevent.sigev_signo);
  842                         }
  843                 }
  844 
  845                 /*
  846                  * Disconnect from user address space
  847                  */
  848                 if (curcp != mycp) {
  849                         struct vmspace *tmpvm;
  850                         /*
  851                          * Get the user address space to disconnect from.
  852                          */
  853                         tmpvm = mycp->p_vmspace;
  854                         /*
  855                          * Get original address space for daemon.
  856                          */
  857                         mycp->p_vmspace = myvm;
  858                         /*
  859                          * Activate the daemon's address space.
  860                          */
  861                         pmap_activate(mycp);
  862 #if defined(DIAGNOSTIC)
  863                         if (tmpvm == myvm)
  864                                 printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
  865 #endif
  866                         /*
  867                          * remove our vmspace reference.
  868                          */
  869                         vmspace_free(tmpvm);
  870                         /*
  871                          * disassociate from the user process's file descriptors.
  872                          */
  873                         if (mycp->p_fd)
  874                                 fdfree(mycp);
  875                         mycp->p_fd = NULL;
  876                         curcp = mycp;
  877                 }
  878 
  879                 /*
  880                  * If we are the first to be put onto the free queue, wakeup
  881                  * anyone waiting for a daemon.
  882                  */
  883                 TAILQ_REMOVE(&aio_activeproc, aiop, list);
  884                 if (TAILQ_EMPTY(&aio_freeproc))
  885                         wakeup(&aio_freeproc);
  886                 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
  887                 aiop->aioprocflags |= AIOP_FREE;
  888 
  889                 /*
  890                  * If daemon is inactive for a long time, allow it to exit, thereby
  891                  * freeing resources.
  892                  */
  893                 if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
  894                         tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
  895                         if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
  896                                 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
  897                                 if ((aiop->aioprocflags & AIOP_FREE) &&
  898                                         (num_aio_procs > target_aio_procs)) {
  899                                         TAILQ_REMOVE(&aio_freeproc, aiop, list);
  900                                         zfree(aiop_zone, aiop);
  901                                         num_aio_procs--;
  902 #if defined(DIAGNOSTIC)
  903                                         if (mycp->p_vmspace->vm_refcnt <= 1)
  904                                                 printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
  905                                                         mycp->p_vmspace->vm_refcnt);
  906 #endif
  907                                         exit1(mycp, 0);
  908                                 }
  909                         }
  910                 }
  911         }
  912 }
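
/*
 * Daemon lifecycle summary: aio_newproc() below forks a daemon, which
 * temporarily borrows the vmspace and descriptor table of each client it
 * services and then detaches again; an idle daemon exits after
 * aiod_lifetime ticks when the pool exceeds target_aio_procs.
 */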
  913 
  914 /*
  915  * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
  916  * The AIO daemon modifies its environment itself.
  917  */
  918 static int
   919 aio_newproc(void)
  920 {
  921         int error;
  922         struct rfork_args rfa;
  923         struct proc *p, *np;
  924 
  925         rfa.flags = RFPROC | RFCFDG;
  926 
  927         p = curproc;
   928         if ((error = rfork(p, &rfa)) != 0)
  929                 return error;
  930 
  931         np = pfind(p->p_retval[0]);
  932         cpu_set_fork_handler(np, aio_daemon, p);
  933 
   934         /*
   935          * Wait until daemon is started, but continue on just in case (to
   936          * handle error conditions).
   937          */
  938         error = tsleep(np, PZERO, "aiosta", aiod_timeout);
  939         num_aio_procs++;
  940 
  941         return error;
  942 
  943 }
  944 
   945 /*
   946  * Try the high-performance physio method for eligible VCHR devices.  This
   947  * routine doesn't require the use of any additional threads, and has
   948  * little overhead.
   949  */
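/*
 * Eligibility, as checked below: the descriptor must reference a VCHR
 * vnode (not a tty) whose driver exports a block-strategy routine, the
 * transfer must be a multiple of DEV_BSIZE and no larger than MAXPHYS,
 * and the per-process physio-buffer quota must not be exhausted.
 * Ineligible requests return -1 so the caller falls back to the
 * daemon path.
 */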
  950 int
  951 aio_qphysio(p, aiocbe)
  952         struct proc *p;
  953         struct aiocblist *aiocbe;
  954 {
  955         int error;
  956         struct aiocb *cb;
  957         struct file *fp;
  958         struct buf *bp;
  959         int bflags;
  960         struct vnode *vp;
  961         struct kaioinfo *ki;
  962         struct filedesc *fdp;
  963         struct aio_liojob *lj;
  964         int fd;
  965         int majordev;
  966         int s;
  967         int cnt;
  968         dev_t dev;
  969         int rw;
  970         d_strategy_t *fstrategy;
  971         struct cdevsw *cdev;
  972         struct cdevsw *bdev;
  973 
  974         cb = &aiocbe->uaiocb;
  975         fdp = p->p_fd;
  976         fd = cb->aio_fildes;
  977         fp = fdp->fd_ofiles[fd];
  978 
  979         if (fp->f_type != DTYPE_VNODE) {
  980                 return -1;
  981         }
  982 
  983         vp = (struct vnode *)fp->f_data;
  984         if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
  985                 return -1;
  986         }
  987 
  988         if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
  989                 return -1;
  990         }
  991 
  992         if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
  993                 return -1;
  994         }
  995 
  996         majordev = major(vp->v_rdev);
  997         if (majordev == NODEV) {
  998                 return -1;
  999         }
 1000 
 1001         cdev = cdevsw[major(vp->v_rdev)];
 1002         if (cdev == NULL) {
 1003                 return -1;
 1004         }
 1005 
 1006         if (cdev->d_bmaj == -1) {
 1007                 return -1;
 1008         }
 1009         bdev = cdev;
 1010 
 1011         ki = p->p_aioinfo;
 1012         if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
 1013                 return -1;
 1014         }
 1015 
 1016         cnt = cb->aio_nbytes;
 1017         if (cnt > MAXPHYS) {
 1018                 return -1;
 1019         }
 1020 
 1021         dev = makedev(bdev->d_bmaj, minor(vp->v_rdev));
 1022 
 1023         /*
 1024          * Physical I/O is charged directly to the process, so we don't have
 1025          * to fake it.
 1026          */
 1027         aiocbe->inputcharge = 0;
 1028         aiocbe->outputcharge = 0;
 1029 
 1030         ki->kaio_buffer_count++;
 1031 
 1032         lj = aiocbe->lio;
 1033         if (lj) {
 1034                 lj->lioj_buffer_count++;
 1035         }
 1036 
 1037         /* create and build a buffer header for a transfer */
 1038         bp = (struct buf *)getpbuf();
 1039 
 1040         /*
 1041          * get a copy of the kva from the physical buffer
 1042          */
 1043         bp->b_proc = p;
 1044         bp->b_dev = dev;
 1045         error = bp->b_error = 0;
 1046 
 1047         if (cb->aio_lio_opcode == LIO_WRITE) {
 1048                 rw = 0;
 1049                 bflags = B_WRITE;
 1050         } else {
 1051                 rw = 1;
 1052                 bflags = B_READ;
 1053         }
 1054         
 1055         bp->b_bcount = cb->aio_nbytes;
 1056         bp->b_bufsize = cb->aio_nbytes;
 1057         bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
 1058         bp->b_iodone = aio_physwakeup;
 1059         bp->b_saveaddr = bp->b_data;
 1060         bp->b_data = (void *) cb->aio_buf;
 1061         bp->b_blkno = btodb(cb->aio_offset);
 1062 
 1063         if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
 1064                 error = EFAULT;
 1065                 goto doerror;
 1066         }
 1067         if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
 1068                 error = EFAULT;
 1069                 goto doerror;
 1070         }
 1071 
 1072         /* bring buffer into kernel space */
 1073         vmapbuf(bp);
 1074 
 1075         s = splbio();
 1076         aiocbe->bp = bp;
 1077         bp->b_spc = (void *)aiocbe;
 1078         TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
 1079         TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
 1080         aiocbe->jobstate = JOBST_JOBQBUF;
 1081         cb->_aiocb_private.status = cb->aio_nbytes;
 1082         num_buf_aio++;
 1083         fstrategy = bdev->d_strategy;
 1084         bp->b_error = 0;
 1085 
 1086         splx(s);
 1087         /* perform transfer */
 1088         (*fstrategy)(bp);
 1089 
 1090         s = splbio();
 1091         /*
 1092          * If we had an error invoking the request, or an error in processing
 1093          * the request before we have returned, we process it as an error
 1094          * in transfer.  Note that such an I/O error is not indicated immediately,
 1095          * but is returned using the aio_error mechanism.  In this case, aio_suspend
 1096          * will return immediately.
 1097          */
 1098         if (bp->b_error || (bp->b_flags & B_ERROR)) {
 1099                 struct aiocb *job = aiocbe->uuaiocb;
 1100 
 1101                 aiocbe->uaiocb._aiocb_private.status = 0;
 1102                 suword(&job->_aiocb_private.status, 0);
 1103                 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
 1104                 suword(&job->_aiocb_private.error, bp->b_error);
 1105 
 1106                 ki->kaio_buffer_finished_count++;
 1107 
 1108                 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
 1109                         aiocbe->jobstate = JOBST_JOBBFINISHED;
 1110                         aiocbe->jobflags |= AIOCBLIST_DONE;
 1111                         TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
 1112                         TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
 1113                         TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
 1114                 }
 1115         }
 1116         splx(s);
 1117         return 0;
 1118 
 1119 doerror:
 1120         ki->kaio_buffer_count--;
 1121         if (lj) {
 1122                 lj->lioj_buffer_count--;
 1123         }
 1124         aiocbe->bp = NULL;
 1125         relpbuf(bp);
 1126         return error;
 1127 }
 1128 
 1129 /*
 1130  * This waits/tests physio completion.
 1131  */
 1132 int
 1133 aio_fphysio(p, iocb, flgwait)
 1134         struct proc *p;
 1135         struct aiocblist *iocb;
 1136         int flgwait;
 1137 {
 1138         int s;
 1139         struct buf *bp;
 1140         int error;
 1141 
 1142         bp = iocb->bp;
 1143 
 1144         s = splbio();
 1145         if (flgwait == 0) {
 1146                 if ((bp->b_flags & B_DONE) == 0) {
 1147                         splx(s);
 1148                         return EINPROGRESS;
 1149                 }
 1150         }
 1151 
 1152         while ((bp->b_flags & B_DONE) == 0) {
 1153                 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
 1154                         if ((bp->b_flags & B_DONE) == 0) {
 1155                                 splx(s);
 1156                                 return EINPROGRESS;
 1157                         } else {
 1158                                 break;
 1159                         }
 1160                 }
 1161         }
 1162 
 1163         /* release mapping into kernel space */
 1164         vunmapbuf(bp);
 1165         iocb->bp = 0;
 1166 
 1167         error = 0;
 1168         /*
 1169          * check for an error
 1170          */
 1171         if (bp->b_flags & B_ERROR) {
 1172                 error = bp->b_error;
 1173         }
 1174 
 1175         relpbuf(bp);
 1176         return (error);
 1177 }
 1178 
 1179 /*
 1180  * Queue a new AIO request.  Choosing either the threaded or direct physio
 1181  * VCHR technique is done in this code.
 1182  */
 1183 static int
 1184 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
 1185 {
 1186         struct filedesc *fdp;
 1187         struct file *fp;
 1188         unsigned int fd;
 1189 
 1190         int error;
 1191         int opcode;
 1192         struct aiocblist *aiocbe;
 1193         struct aioproclist *aiop;
 1194         struct kaioinfo *ki;
 1195 
  1196         if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
  1197                 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
  1198         } else {
  1199                 aiocbe = zalloc(aiocb_zone);
  1200         }
 1201 
 1202         aiocbe->inputcharge = 0;
 1203         aiocbe->outputcharge = 0;
 1204 
 1205         suword(&job->_aiocb_private.status, -1);
 1206         suword(&job->_aiocb_private.error, 0);
 1207         suword(&job->_aiocb_private.kernelinfo, -1);
 1208 
 1209         error = copyin((caddr_t)job,
 1210                 (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
 1211         if (error) {
 1212                 suword(&job->_aiocb_private.error, error);
 1213 
 1214                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1215                 return error;
 1216         }
 1217 
 1218         /*
 1219          * Save userspace address of the job info
 1220          */
 1221         aiocbe->uuaiocb = job;
 1222 
 1223         /*
 1224          * Get the opcode
 1225          */
 1226         if (type != LIO_NOP) {
 1227                 aiocbe->uaiocb.aio_lio_opcode = type;
 1228         }
 1229         opcode = aiocbe->uaiocb.aio_lio_opcode;
 1230 
 1231         /*
 1232          * Get the fd info for process
 1233          */
 1234         fdp = p->p_fd;
 1235 
 1236         /*
 1237          * Range check file descriptor
 1238          */
 1239         fd = aiocbe->uaiocb.aio_fildes;
 1240         if (fd >= fdp->fd_nfiles) {
 1241                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1242                 if (type == 0) {
 1243                         suword(&job->_aiocb_private.error, EBADF);
 1244                 }
 1245                 return EBADF;
 1246         }
 1247 
 1248         fp = fdp->fd_ofiles[fd];
 1249         if ((fp == NULL) ||
 1250                 ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
 1251                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1252                 if (type == 0) {
 1253                         suword(&job->_aiocb_private.error, EBADF);
 1254                 }
 1255                 return EBADF;
 1256         }
 1257 
 1258         if (aiocbe->uaiocb.aio_offset == -1LL) {
 1259                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1260                 if (type == 0) {
 1261                         suword(&job->_aiocb_private.error, EINVAL);
 1262                 }
 1263                 return EINVAL;
 1264         }
 1265 
 1266         error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
 1267         if (error) {
 1268                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1269                 if (type == 0) {
 1270                         suword(&job->_aiocb_private.error, EINVAL);
 1271                 }
 1272                 return error;
 1273         }
 1274 
 1275         aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
 1276         if (jobrefid == LONG_MAX)
 1277                 jobrefid = 1;
 1278         else
 1279                 jobrefid++;
 1280         
 1281         if (opcode == LIO_NOP) {
 1282                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1283                 if (type == 0) {
 1284                         suword(&job->_aiocb_private.error, 0);
 1285                         suword(&job->_aiocb_private.status, 0);
 1286                         suword(&job->_aiocb_private.kernelinfo, 0);
 1287                 }
 1288                 return 0;
 1289         }
 1290 
 1291         if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
 1292                 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
 1293                 if (type == 0) {
 1294                         suword(&job->_aiocb_private.status, 0);
 1295                         suword(&job->_aiocb_private.error, EINVAL);
 1296                 }
 1297                 return EINVAL;
 1298         }
 1299 
 1300         suword(&job->_aiocb_private.error, EINPROGRESS);
 1301         aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
 1302         aiocbe->userproc = p;
 1303         aiocbe->jobflags = 0;
 1304         aiocbe->lio = lj;
 1305         ki = p->p_aioinfo;
 1306 
 1307         if ((error = aio_qphysio(p, aiocbe)) == 0) {
 1308                 return 0;
 1309         } else if (error > 0) {
 1310                 suword(&job->_aiocb_private.status, 0);
 1311                 aiocbe->uaiocb._aiocb_private.error = error;
 1312                 suword(&job->_aiocb_private.error, error);
 1313                 return error;
 1314         }
 1315 
 1316         /*
 1317          * No buffer for daemon I/O
 1318          */
 1319         aiocbe->bp = NULL;
 1320 
 1321         ki->kaio_queue_count++;
 1322         if (lj) {
 1323                 lj->lioj_queue_count++;
 1324         }
 1325         TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
 1326         TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
 1327         aiocbe->jobstate = JOBST_JOBQGLOBAL;
 1328 
 1329         num_queue_count++;
 1330         error = 0;
 1331 
  1332         /*
  1333          * If we don't have a free AIO process, and we are below our
  1334          * quota, then start one.  Otherwise, depend on the subsequent
  1335          * I/O completions to pick up this job.  If we don't successfully
  1336          * create the new process (thread) due to resource issues, we
  1337          * return an error for now (EAGAIN), which is likely not the
  1338          * correct thing to do.
  1339          */
 1340 retryproc:
  1341         if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 1342                 TAILQ_REMOVE(&aio_freeproc, aiop, list);
 1343                 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
 1344                 aiop->aioprocflags &= ~AIOP_FREE;
 1345                 wakeup(aiop->aioproc);
 1346         } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
 1347                         ((ki->kaio_active_count + num_aio_resv_start) <
 1348                                 ki->kaio_maxactive_count)) {
 1349                 num_aio_resv_start++;
 1350                 if ((error = aio_newproc()) == 0) {
 1351                         num_aio_resv_start--;
 1352                         p->p_retval[0] = 0;
 1353                         goto retryproc;
 1354                 }
 1355                 num_aio_resv_start--;
 1356         }
 1357         return error;
 1358 }
 1359 
 1360 /*
 1361  * This routine queues an AIO request, checking for quotas.
 1362  */
 1363 static int
 1364 aio_aqueue(struct proc *p, struct aiocb *job, int type)
 1365 {
 1366         struct kaioinfo *ki;
 1367 
 1368         if (p->p_aioinfo == NULL) {
 1369                 aio_init_aioinfo(p);
 1370         }
 1371 
 1372         if (num_queue_count >= max_queue_count)
 1373                 return EAGAIN;
 1374 
 1375         ki = p->p_aioinfo;
 1376         if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
 1377                 return EAGAIN;
 1378 
 1379         return _aio_aqueue(p, job, NULL, type);
 1380 }
 1381 
  1382 /*
  1383  * Support the aio_return system call; as a side effect, kernel
  1384  * resources are released.
  1385  */
 1386 int
 1387 aio_return(struct proc *p, struct aio_return_args *uap)
 1388 {
 1389         int s;
 1390         int jobref;
 1391         struct aiocblist *cb, *ncb;
 1392         struct aiocb *ujob;
 1393         struct kaioinfo *ki;
 1394 
 1395         ki = p->p_aioinfo;
 1396         if (ki == NULL) {
 1397                 return EINVAL;
 1398         }
 1399 
 1400         ujob = uap->aiocbp;
 1401 
 1402         jobref = fuword(&ujob->_aiocb_private.kernelinfo);
 1403         if (jobref == -1 || jobref == 0)
 1404                 return EINVAL;
 1405 
 1406         for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
 1407                 cb;
 1408                 cb = TAILQ_NEXT(cb, plist)) {
 1409                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1410                         if (ujob == cb->uuaiocb) {
 1411                                 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
 1412                         } else {
 1413                                 p->p_retval[0] = EFAULT;
 1414                         }
 1415                         if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 1416                                 curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
 1417                                 cb->outputcharge = 0;
 1418                         } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 1419                                 curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
 1420                                 cb->inputcharge = 0;
 1421                         }
 1422                         aio_free_entry(cb);
 1423                         return 0;
 1424                 }
 1425         }
 1426 
 1427         s = splbio();
 1428         for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
 1429                 cb;
 1430                 cb = ncb) {
 1431                 ncb = TAILQ_NEXT(cb, plist);
 1432                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1433                         splx(s);
 1434                         if (ujob == cb->uuaiocb) {
 1435                                 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
 1436                         } else {
 1437                                 p->p_retval[0] = EFAULT;
 1438                         }
 1439                         aio_free_entry(cb);
 1440                         return 0;
 1441                 }
 1442         }
 1443         splx(s);
 1444 
 1445         return (EINVAL);
 1446 }
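
From userland, aio_return(3) is the reap step: it must be called exactly once per completed request, because the loops above free the aiocblist (aio_free_entry()) as they report the status, and the block-I/O accounting (ru_inblock/ru_oublock) is credited at the same moment. A minimal, hedged usage sketch against the POSIX API follows; the file name is arbitrary, and the busy-wait is only for brevity.

    #include <aio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            static char buf[512];
            struct aiocb cb;
            ssize_t n;
            int fd;

            fd = open("/etc/motd", O_RDONLY);
            if (fd == -1)
                    return (1);

            memset(&cb, 0, sizeof(cb));
            cb.aio_fildes = fd;
            cb.aio_buf = buf;
            cb.aio_nbytes = sizeof(buf);
            cb.aio_offset = 0;

            if (aio_read(&cb) == -1)
                    return (1);

            /* Busy-wait only for brevity; aio_suspend() is the polite way. */
            while (aio_error(&cb) == EINPROGRESS)
                    ;

            n = aio_return(&cb);        /* reaps the kernel job entry */
            printf("read %zd bytes\n", n);
            close(fd);
            return (0);
    }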
 1447 
 1448 /*
 1449  * Allow a process to wake up when any of its outstanding I/O
 1450  * requests has completed.
 1451  */
 1452 int
 1453 aio_suspend(struct proc *p, struct aio_suspend_args *uap)
 1454 {
 1455         struct timeval atv;
 1456         struct timespec ts;
 1457         struct aiocb *const *cbptr, *cbp;
 1458         struct kaioinfo *ki;
 1459         struct aiocblist *cb;
 1460         int i;
 1461         int njoblist;
 1462         int error, s, timo;
 1463         int *ijoblist;
 1464         struct aiocb **ujoblist;
 1465         
 1466         if (uap->nent > AIO_LISTIO_MAX)
 1467                 return EINVAL;
 1468 
 1469         timo = 0;
 1470         if (uap->timeout) {
 1471                 /*
 1472                  * Get timespec struct
 1473                  */
 1474                 if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
 1475                         return error;
 1476                 }
 1477 
 1478                 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
 1479                         return (EINVAL);
 1480 
 1481                 TIMESPEC_TO_TIMEVAL(&atv, &ts);
 1482                 if (itimerfix(&atv))
 1483                         return (EINVAL);
 1484                 timo = tvtohz(&atv);
 1485         }
 1486 
 1487         ki = p->p_aioinfo;
 1488         if (ki == NULL)
 1489                 return EAGAIN;
 1490 
 1491         njoblist = 0;
 1492         ijoblist = zalloc(aiol_zone);
 1493         ujoblist = zalloc(aiol_zone);
 1494         cbptr = uap->aiocbp;
 1495 
 1496         for(i = 0; i < uap->nent; i++) {
 1497                 cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
 1498                 if (cbp == 0)
 1499                         continue;
 1500                 ujoblist[njoblist] = cbp;
 1501                 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
 1502                 njoblist++;
 1503         }
 1504         if (njoblist == 0) {
 1505                 zfree(aiol_zone, ijoblist);
 1506                 zfree(aiol_zone, ujoblist);
 1507                 return 0;
 1508         }
 1509 
 1510         error = 0;
 1511         while (1) {
 1512                 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
 1513                         cb; cb = TAILQ_NEXT(cb, plist)) {
 1514                         for(i = 0; i < njoblist; i++) {
 1515                                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
 1516                                         ijoblist[i]) {
 1517                                         if (ujoblist[i] != cb->uuaiocb)
 1518                                                 error = EINVAL;
 1519                                         zfree(aiol_zone, ijoblist);
 1520                                         zfree(aiol_zone, ujoblist);
 1521                                         return error;
 1522                                 }
 1523                         }
 1524                 }
 1525 
 1526                 s = splbio();
 1527                 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
 1528                         cb; cb = TAILQ_NEXT(cb, plist)) {
 1529                         for(i = 0; i < njoblist; i++) {
 1530                                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
 1531                                         ijoblist[i]) {
 1532                                         splx(s);
 1533                                         if (ujoblist[i] != cb->uuaiocb)
 1534                                                 error = EINVAL;
 1535                                         zfree(aiol_zone, ijoblist);
 1536                                         zfree(aiol_zone, ujoblist);
 1537                                         return error;
 1538                                 }
 1539                         }
 1540                 }
 1541 
 1542                 ki->kaio_flags |= KAIO_WAKEUP;
 1543                 error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
 1544                 splx(s);
 1545 
 1546                 if (error == EINTR) {
 1547                         zfree(aiol_zone, ijoblist);
 1548                         zfree(aiol_zone, ujoblist);
 1549                         return EINTR;
 1550                 } else if (error == EWOULDBLOCK) {
 1551                         zfree(aiol_zone, ijoblist);
 1552                         zfree(aiol_zone, ujoblist);
 1553                         return EAGAIN;
 1554                 }
 1555         }
 1556 
 1557 /* NOTREACHED */
 1558         return EINVAL;
 1559 }
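
The kernel converts the caller's timespec to ticks, sets KAIO_WAKEUP, and tsleep()s on the process; completion paths such as aio_physwakeup() issue the matching wakeup. From userland that machinery is reached through aio_suspend(3). A small hedged sketch follows; wait_for_either is an illustrative helper, and cb1/cb2 are assumed to be already-queued requests.

    #include <aio.h>
    #include <time.h>

    /* Wait up to two seconds for either of two outstanding requests. */
    int
    wait_for_either(struct aiocb *cb1, struct aiocb *cb2)
    {
            const struct aiocb *list[2];
            struct timespec ts;

            list[0] = cb1;              /* NULL slots are skipped, exactly as */
            list[1] = cb2;              /* the kernel loop above skips them   */
            ts.tv_sec = 2;
            ts.tv_nsec = 0;

            /* Returns 0 when some request completes, or -1 with errno set
             * to EAGAIN on timeout or EINTR when interrupted, matching the
             * tsleep() error handling above. */
            return (aio_suspend(list, 2, &ts));
    }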
 1560 
 1561 /*
 1562  * aio_cancel is a no-op at the kernel level right now.  It
 1563  * might be possible to support it partially in user mode, or
 1564  * in kernel mode later on.
 1565  */
 1566 int
 1567 aio_cancel(struct proc *p, struct aio_cancel_args *uap)
 1568 {
 1569         return ENOSYS;
 1570 }
 1571 
 1572 /*
 1573  * aio_error is implemented at the kernel level for compatibility
 1574  * purposes only.  For a user mode async implementation, it would be
 1575  * best to do it in a userland subroutine.
 1576  */
 1577 int
 1578 aio_error(struct proc *p, struct aio_error_args *uap)
 1579 {
 1580         int s;
 1581         struct aiocblist *cb;
 1582         struct kaioinfo *ki;
 1583         int jobref;
 1584 
 1585         ki = p->p_aioinfo;
 1586         if (ki == NULL)
 1587                 return EINVAL;
 1588 
 1589         jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
 1590         if ((jobref == -1) || (jobref == 0))
 1591                 return EINVAL;
 1592 
 1593         for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
 1594                 cb;
 1595                 cb = TAILQ_NEXT(cb, plist)) {
 1596 
 1597                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1598                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
 1599                         return 0;
 1600                 }
 1601         }
 1602 
 1603         for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
 1604                 cb;
 1605                 cb = TAILQ_NEXT(cb, plist)) {
 1606 
 1607                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1608                         p->p_retval[0] = EINPROGRESS;
 1609                         return 0;
 1610                 }
 1611         }
 1612 
 1613         s = splbio();
 1614         for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
 1615                 cb;
 1616                 cb = TAILQ_NEXT(cb, plist)) {
 1617                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1618                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
 1619                         splx(s);
 1620                         return 0;
 1621                 }
 1622         }
 1623 
 1624         for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
 1625                 cb;
 1626                 cb = TAILQ_NEXT(cb, plist)) {
 1627                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
 1628                         p->p_retval[0] = EINPROGRESS;
 1629                         splx(s);
 1630                         return 0;
 1631                 }
 1632         }
 1633         splx(s);
 1634 
 1635 
 1636         /*
 1637          * Hack for lio
 1638          */
 1639 /*
 1640         status = fuword(&uap->aiocbp->_aiocb_private.status);
 1641         if (status == -1) {
 1642                 return fuword(&uap->aiocbp->_aiocb_private.error);
 1643         }
 1644 */
 1645         return EINVAL;
 1646 }
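
The four scans above define aio_error()'s contract: requests still on kaio_jobqueue/kaio_bufqueue report EINPROGRESS, while requests on the done queues report their saved completion error. A hedged userland polling helper built on that contract (finish is an illustrative name):

    #include <aio.h>
    #include <errno.h>
    #include <unistd.h>

    /* Poll a queued request until it leaves EINPROGRESS, then reap it.
     * Returns the transfer count, or -1 with errno set from the request. */
    ssize_t
    finish(struct aiocb *cb)
    {
            int err;

            while ((err = aio_error(cb)) == EINPROGRESS)
                    usleep(1000);       /* crude backoff; aio_suspend() is better */

            if (err != 0) {
                    errno = err;
                    (void)aio_return(cb);   /* still reap the kernel entry */
                    return (-1);
            }
            return (aio_return(cb));
    }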
 1647 
 1648 int
 1649 aio_read(struct proc *p, struct aio_read_args *uap)
 1650 {
 1651         struct filedesc *fdp;
 1652         struct file *fp;
 1653         struct uio auio;
 1654         struct iovec aiov;
 1655         unsigned int fd;
 1656         int cnt;
 1657         struct aiocb iocb;
 1658         int error, pmodes;
 1659 
 1660         pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
 1661         if ((pmodes & AIO_PMODE_SYNC) == 0) {
 1662                 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
 1663         }
 1664 
 1665         /*
 1666          * Get control block
 1667          */
 1668         if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
 1669                 return error;
 1670 
 1671         /*
 1672          * Get the fd info for process
 1673          */
 1674         fdp = p->p_fd;
 1675 
 1676         /*
 1677          * Range check file descriptor
 1678          */
 1679         fd = iocb.aio_fildes;
 1680         if (fd >= fdp->fd_nfiles)
 1681                 return EBADF;
 1682         fp = fdp->fd_ofiles[fd];
 1683         if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
 1684                 return EBADF;
 1685         if (iocb.aio_offset == -1LL)
 1686                 return EINVAL;
 1687 
 1688         auio.uio_resid = iocb.aio_nbytes;
 1689         if (auio.uio_resid < 0)
 1690                 return (EINVAL);
 1691 
 1692         /*
 1693          * Not a synchronous request: queue it for async processing.
 1694          */
 1695         if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
 1696                 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
 1697         }
 1698 
 1699         aiov.iov_base = (void *) iocb.aio_buf;
 1700         aiov.iov_len = iocb.aio_nbytes;
 1701 
 1702         auio.uio_iov = &aiov;
 1703         auio.uio_iovcnt = 1;
 1704         auio.uio_offset = iocb.aio_offset;
 1705         auio.uio_rw = UIO_READ;
 1706         auio.uio_segflg = UIO_USERSPACE;
 1707         auio.uio_procp = p;
 1708 
 1709         cnt = iocb.aio_nbytes;
 1710         error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET);
 1711         if (error &&
 1712                 (auio.uio_resid != cnt) &&
 1713                 (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
 1714                         error = 0;
 1715         cnt -= auio.uio_resid;
 1716         p->p_retval[0] = cnt;
 1717         return error;
 1718 }
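
The tail of aio_read() follows the usual convention for interruptible transfers: if fo_read() failed with ERESTART/EINTR/EWOULDBLOCK but some bytes had already moved (uio_resid changed), the error is dropped and the partial count is returned instead. A hedged userland analogue of that convention, built around plain read(2):

    #include <errno.h>
    #include <unistd.h>

    /* Read exactly len bytes if possible; when a signal interrupts a
     * partially completed transfer, report the partial count, not EINTR. */
    ssize_t
    readn(int fd, void *buf, size_t len)
    {
            size_t done = 0;
            ssize_t n;

            while (done < len) {
                    n = read(fd, (char *)buf + done, len - done);
                    if (n == 0)
                            break;              /* EOF */
                    if (n == -1) {
                            if (errno == EINTR) {
                                    if (done > 0)
                                            break;  /* progress made: report it */
                                    continue;       /* no progress: just retry */
                            }
                            return (-1);
                    }
                    done += (size_t)n;
            }
            return ((ssize_t)done);
    }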
 1719 
 1720 int
 1721 aio_write(struct proc *p, struct aio_write_args *uap)
 1722 {
 1723         struct filedesc *fdp;
 1724         struct file *fp;
 1725         struct uio auio;
 1726         struct iovec aiov;
 1727         unsigned int fd;
 1728         int cnt;
 1729         struct aiocb iocb;
 1730         int error;
 1731         int pmodes;
 1732 
 1733         /*
 1734          * Not a synchronous request: queue it for async processing.
 1735          */
 1736         pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
 1737         if ((pmodes & AIO_PMODE_SYNC) == 0) {
 1738                 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
 1739         }
 1740 
 1741         if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
 1742                 return error;
 1743 
 1744         /*
 1745          * Get the fd info for process
 1746          */
 1747         fdp = p->p_fd;
 1748 
 1749         /*
 1750          * Range check file descriptor
 1751          */
 1752         fd = iocb.aio_fildes;
 1753         if (fd >= fdp->fd_nfiles)
 1754                 return EBADF;
 1755         fp = fdp->fd_ofiles[fd];
 1756         if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
 1757                 return EBADF;
 1758         if (iocb.aio_offset == -1LL)
 1759                 return EINVAL;
 1760 
 1761         aiov.iov_base = (void *) iocb.aio_buf;
 1762         aiov.iov_len = iocb.aio_nbytes;
 1763         auio.uio_iov = &aiov;
 1764         auio.uio_iovcnt = 1;
 1765         auio.uio_offset = iocb.aio_offset;
 1766 
 1767         auio.uio_resid = iocb.aio_nbytes;
 1768         if (auio.uio_resid < 0)
 1769                 return (EINVAL);
 1770 
 1771         auio.uio_rw = UIO_WRITE;
 1772         auio.uio_segflg = UIO_USERSPACE;
 1773         auio.uio_procp = p;
 1774 
 1775         cnt = iocb.aio_nbytes;
 1776         error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET);
 1777         if (error) {
 1778                 if ((auio.uio_resid != cnt) &&
 1779                     (error == ERESTART || error == EINTR ||
 1780                      error == EWOULDBLOCK))
 1781                         error = 0;
 1782                 if (error == EPIPE)
 1783                         psignal(p, SIGPIPE);
 1784         }
 1785         cnt -= auio.uio_resid;
 1786         p->p_retval[0] = cnt;
 1787         return error;
 1788 }
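
aio_write()'s synchronous path mirrors write(2), including SIGPIPE delivery on EPIPE (the psignal() call above). A caller that would rather see the EPIPE error than take the signal can ignore SIGPIPE first; a minimal hedged illustration using plain write(2):

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fds[2];
            ssize_t n;

            signal(SIGPIPE, SIG_IGN);   /* get EPIPE instead of a fatal signal */

            if (pipe(fds) == -1)
                    return (1);
            close(fds[0]);              /* widow the pipe */

            n = write(fds[1], "x", 1);
            if (n == -1)
                    printf("write failed: %s\n", strerror(errno));
            return (0);
    }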
 1789 
 1790 int
 1791 lio_listio(struct proc *p, struct lio_listio_args *uap)
 1792 {
 1793         int nent, nentqueued;
 1794         struct aiocb *iocb, * const *cbptr;
 1795         struct aiocblist *cb;
 1796         struct kaioinfo *ki;
 1797         struct aio_liojob *lj;
 1798         int error, runningcode;
 1799         int nerror;
 1800         int i;
 1801         int s;
 1802 
 1803         if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
 1804                 return EINVAL;
 1805         }
 1806 
 1807         nent = uap->nent;
 1808         if (nent > AIO_LISTIO_MAX) {
 1809                 return EINVAL;
 1810         }
 1811 
 1812         if (p->p_aioinfo == NULL) {
 1813                 aio_init_aioinfo(p);
 1814         }
 1815 
 1816         if ((nent + num_queue_count) > max_queue_count) {
 1817                 return EAGAIN;
 1818         }
 1819 
 1820         ki = p->p_aioinfo;
 1821         if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
 1822                 return EAGAIN;
 1823         }
 1824 
 1825         lj = zalloc(aiolio_zone);
 1826         if (!lj) {
 1827                 return EAGAIN;
 1828         }
 1829 
 1830         lj->lioj_flags = 0;
 1831         lj->lioj_buffer_count = 0;
 1832         lj->lioj_buffer_finished_count = 0;
 1833         lj->lioj_queue_count = 0;
 1834         lj->lioj_queue_finished_count = 0;
 1835         lj->lioj_ki = ki;
 1836         TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 1837 
 1838         /*
 1839          * Setup signal
 1840          */
 1841         if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 1842                 error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
 1843                 if (error)
 1844                         return error;
 1845                 lj->lioj_flags |= LIOJ_SIGNAL;
 1846                 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
 1847         } else {
 1848                 lj->lioj_flags &= ~LIOJ_SIGNAL;
 1849         }
 1850 
 1851         /*
 1852          * Get pointers to the list of I/O requests.
 1853          */
 1854 
 1855         nerror = 0;
 1856         nentqueued = 0;
 1857         cbptr = uap->acb_list;
 1858         for(i = 0; i < uap->nent; i++) {
 1859                 iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
 1860                 if (((intptr_t) iocb != -1) && ((intptr_t) iocb != 0)) {
 1861                         error = _aio_aqueue(p, iocb, lj, 0);
 1862                         if (error == 0) {
 1863                                 nentqueued++;
 1864                         } else {
 1865                                 nerror++;
 1866                         }
 1867                 }
 1868         }
 1869 
 1870         /*
 1871          * If we haven't queued any, then just return.
 1872          */
 1873         if (nentqueued == 0) {
 1874                 return 0;
 1875         }
 1876 
 1877         /*
 1878          * Calculate the appropriate error return
 1879          */
 1880         runningcode = 0;
 1881         if (nerror)
 1882                 runningcode = EIO;
 1883 
 1884         if (uap->mode == LIO_WAIT) {
 1885                 while (1) {
 1886                         int found;
 1887                         found = 0;
 1888                         for(i = 0; i < uap->nent; i++) {
 1889                                 int jobref, command;
 1890 
 1891                                 /*
 1892                                  * Fetch address of the control buf pointer in user space
 1893                                  */
 1894                                 iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
 1895                                 if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0))
 1896                                         continue;
 1897 
 1898                                 /*
 1899                                  * Fetch the associated command from user space
 1900                                  */
 1901                                 command = fuword(&iocb->aio_lio_opcode);
 1902                                 if (command == LIO_NOP) {
 1903                                         found++;
 1904                                         continue;
 1905                                 }
 1906 
 1907                                 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
 1908 
 1909                                 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
 1910                                         cb;
 1911                                         cb = TAILQ_NEXT(cb, plist)) {
 1912                                         if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
 1913                                                 jobref) {
 1914                                                 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 1915                                                         curproc->p_stats->p_ru.ru_oublock +=
 1916                                                                 cb->outputcharge;
 1917                                                         cb->outputcharge = 0;
 1918                                                 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 1919                                                         curproc->p_stats->p_ru.ru_inblock +=
 1920                                                                 cb->inputcharge;
 1921                                                         cb->inputcharge = 0;
 1922                                                 }
 1923                                                 found++;
 1924                                                 break;
 1925                                         }
 1926                                 }
 1927 
 1928                                 s = splbio();
 1929                                 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
 1930                                         cb;
 1931                                         cb = TAILQ_NEXT(cb, plist)) {
 1932                                         if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
 1933                                                 jobref) {
 1934                                                 found++;
 1935                                                 break;
 1936                                         }
 1937                                 }
 1938                                 splx(s);
 1939                                 
 1940                         }
 1941 
 1942                         /*
 1943                          * If all I/Os have been disposed of, then we can return
 1944                          */
 1945                         if (found == nentqueued) {
 1946                                 return runningcode;
 1947                         }
 1948                         
 1949                         ki->kaio_flags |= KAIO_WAKEUP;
 1950                         error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
 1951 
 1952                         if (error == EINTR) {
 1953                                 return EINTR;
 1954                         } else if (error == EWOULDBLOCK) {
 1955                                 return EAGAIN;
 1956                         }
 1957 
 1958                 }
 1959         }
 1960 
 1961         return runningcode;
 1962 }
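
In the LIO_WAIT case the kernel re-fetches each control block from user space on every pass, counts the ones found on a done queue, and tsleep()s until every queued entry is accounted for. A hedged userland sketch of batching two reads through lio_listio(3) follows; read_two is an illustrative helper.

    #include <sys/types.h>
    #include <aio.h>
    #include <string.h>

    /* Queue two reads of len bytes each as one batch and wait for both. */
    int
    read_two(int fd, char *a, char *b, size_t len)
    {
            struct aiocb cb[2];
            struct aiocb *list[2];

            memset(cb, 0, sizeof(cb));
            cb[0].aio_fildes = fd;
            cb[0].aio_buf = a;
            cb[0].aio_nbytes = len;
            cb[0].aio_offset = 0;
            cb[0].aio_lio_opcode = LIO_READ;

            cb[1] = cb[0];              /* same file, next block */
            cb[1].aio_buf = b;
            cb[1].aio_offset = (off_t)len;

            list[0] = &cb[0];
            list[1] = &cb[1];

            /* LIO_WAIT blocks until both jobs reach a done queue; LIO_NOWAIT
             * would return at once and could post the sigevent argument. */
            return (lio_listio(LIO_WAIT, list, 2, NULL));
    }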
 1963 
 1964 /*
 1965  * This is a weird hack so that we can post a signal.  It is safe
 1966  * to do so from a timeout routine, but *not* from an interrupt routine.
 1967  */
 1968 static void
 1969 process_signal(void *ljarg)
 1970 {
 1971         struct aio_liojob *lj = ljarg;
 1972         if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
 1973                 if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
 1974                         psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
 1975                         lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 1976                 }
 1977         }
 1978 }
 1979 
 1980 /*
 1981  * Interrupt handler for physio, performs the necessary process wakeups,
 1982  * and signals.
 1983  */
 1984 static void
 1985 aio_physwakeup(struct buf *bp)
 1986 {
 1987         struct aiocblist *aiocbe;
 1988         struct proc *p;
 1989         struct kaioinfo *ki;
 1990         struct aio_liojob *lj;
 1991         int s;
 1992 
 1993         s = splbio();
 1994 
 1995         wakeup((caddr_t) bp);
 1996         bp->b_flags &= ~B_CALL;
 1997         bp->b_flags |= B_DONE;
 1998 
 1999         aiocbe = (struct aiocblist *)bp->b_spc;
 2000         if (aiocbe) {
 2001                 p = bp->b_proc;
 2002 
 2003                 aiocbe->jobstate = JOBST_JOBBFINISHED;
 2004                 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
 2005                 aiocbe->uaiocb._aiocb_private.error = 0;
 2006                 aiocbe->jobflags |= AIOCBLIST_DONE;
 2007 
 2008                 if (bp->b_flags & B_ERROR) {
 2009                         aiocbe->uaiocb._aiocb_private.error = bp->b_error;
 2010                 }
 2011 
 2012                 lj = aiocbe->lio;
 2013                 if (lj) {
 2014                         lj->lioj_buffer_finished_count++;
 2015                         /*
 2016                          * wakeup/signal if all of the interrupt jobs are done
 2017                          */
 2018                         if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
 2019                                 /*
 2020                                  * post a signal if it is called for
 2021                                  */
 2022                                 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
 2023                                         LIOJ_SIGNAL) {
 2024                                         lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 2025                                         timeout(process_signal, lj, 0);
 2026                                 }
 2027                         }
 2028                 }
 2029 
 2030                 ki = p->p_aioinfo;
 2031                 if (ki) {
 2032                         ki->kaio_buffer_finished_count++;
 2033                         TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
 2034                         TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
 2035                         TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
 2036                         /*
 2037                          * and do the wakeup
 2038                          */
 2039                         if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
 2040                                 ki->kaio_flags &= ~KAIO_WAKEUP;
 2041                                 wakeup(p);
 2042                         }
 2043                 }
 2044         }
 2045         splx(s);
 2046 }
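
aio_physwakeup() runs from the buffer-completion (interrupt) path, where posting a signal directly is unsafe, so it schedules process_signal() through timeout(..., 0) to run at the next softclock tick. The familiar userland analogue of escaping an unsafe context is the self-pipe trick; a minimal hedged sketch:

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static int wakeup_pipe[2];

    /* Async-signal context: write(2) is one of the few safe calls here,
     * so just poke the pipe and defer the real work. */
    static void
    on_signal(int sig)
    {
            (void)sig;
            (void)write(wakeup_pipe[1], "x", 1);
    }

    int
    main(void)
    {
            ssize_t n;
            char c;

            if (pipe(wakeup_pipe) == -1)
                    return (1);
            signal(SIGALRM, on_signal);
            alarm(1);

            /* Safe context: block until the handler pokes us, then run the
             * work that could not be done inside the handler itself. */
            do {
                    n = read(wakeup_pipe[0], &c, 1);
            } while (n == -1 && errno == EINTR);
            if (n == 1)
                    printf("deferred work runs here\n");
            return (0);
    }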
