FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_aio.c
1 /*
2 * Copyright (c) 1997 John S. Dyson. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 * derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author. This software is distributed AS-IS.
15 *
16 * $FreeBSD$
17 */
18
19 /*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
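/*
 * A minimal userland sketch of driving this facility through the
 * POSIX aio interface; the path argument, buffer handling, and the
 * naive busy-wait poll are illustrative assumptions only, and most
 * error handling is omitted:
 *
 *	#include <sys/types.h>
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	static ssize_t
 *	read_async(const char *path, char *buf, size_t len)
 *	{
 *		struct aiocb cb;
 *
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = open(path, O_RDONLY);
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_offset = 0;
 *		if (aio_read(&cb) != 0)
 *			return (-1);
 *		while (aio_error(&cb) == EINPROGRESS)
 *			;
 *		return (aio_return(&cb));
 *	}
 */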
22
23 #include <sys/param.h>
24 #include <sys/systm.h>
25 #include <sys/sysproto.h>
26 #include <sys/filedesc.h>
27 #include <sys/kernel.h>
28 #include <sys/fcntl.h>
29 #include <sys/file.h>
30 #include <sys/lock.h>
31 #include <sys/unistd.h>
32 #include <sys/proc.h>
33 #include <sys/resourcevar.h>
34 #include <sys/signalvar.h>
35 #include <sys/sysctl.h>
36 #include <sys/vnode.h>
37 #include <sys/conf.h>
38 #include <miscfs/specfs/specdev.h>
39
40 #include <vm/vm.h>
41 #include <vm/vm_param.h>
42 #include <vm/vm_extern.h>
43 #include <vm/pmap.h>
44 #include <vm/vm_map.h>
45 #include <vm/vm_zone.h>
46 #include <sys/aio.h>
47 #include <sys/shm.h>
48
49 #include <machine/cpu.h>
50 #include <machine/limits.h>
51
52 static long jobrefid;
53
54 #define JOBST_NULL 0x0
55 #define JOBST_JOBQPROC 0x1
56 #define JOBST_JOBQGLOBAL 0x2
57 #define JOBST_JOBRUNNING 0x3
58 #define JOBST_JOBFINISHED 0x4
59 #define JOBST_JOBQBUF 0x5
60 #define JOBST_JOBBFINISHED 0x6
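/*
 * In rough outline, as far as can be read from this file, the states
 * above are used as follows: a queued job moves from JOBST_NULL to
 * JOBST_JOBQGLOBAL (daemon path) or JOBST_JOBQBUF (physio path); a
 * daemon marks its job JOBST_JOBRUNNING while working on it and
 * JOBST_JOBFINISHED when done; physio completion moves a job to
 * JOBST_JOBBFINISHED; and aio_free_entry() returns it to JOBST_NULL.
 * JOBST_JOBQPROC marks a job parked on a particular daemon's
 * jobtorun queue.
 */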
61
62 #ifndef MAX_AIO_PER_PROC
63 #define MAX_AIO_PER_PROC 32
64 #endif
65
66 #ifndef MAX_AIO_QUEUE_PER_PROC
67 #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
68 #endif
69
70 #ifndef MAX_AIO_PROCS
71 #define MAX_AIO_PROCS 32
72 #endif
73
74 #ifndef MAX_AIO_QUEUE
75 #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
76 #endif
77
78 #ifndef TARGET_AIO_PROCS
79 #define TARGET_AIO_PROCS 0
80 #endif
81
82 #ifndef MAX_BUF_AIO
83 #define MAX_BUF_AIO 16
84 #endif
85
86 #ifndef AIOD_TIMEOUT_DEFAULT
87 #define AIOD_TIMEOUT_DEFAULT (10 * hz)
88 #endif
89
90 #ifndef AIOD_LIFETIME_DEFAULT
91 #define AIOD_LIFETIME_DEFAULT (30 * hz)
92 #endif
93
94 static int max_aio_procs = MAX_AIO_PROCS;
95 static int num_aio_procs = 0;
96 static int target_aio_procs = TARGET_AIO_PROCS;
97 static int max_queue_count = MAX_AIO_QUEUE;
98 static int num_queue_count = 0;
99 static int num_buf_aio = 0;
100 static int num_aio_resv_start = 0;
101 static int aiod_timeout;
102 static int aiod_lifetime;
103
104 static int max_aio_per_proc = MAX_AIO_PER_PROC,
105 max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
106
107 static int max_buf_aio = MAX_BUF_AIO;
108
109 SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
110
111 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
112 CTLFLAG_RW, &max_aio_per_proc, 0, "");
113
114 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
115 CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
116
117 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
118 CTLFLAG_RW, &max_aio_procs, 0, "");
119
120 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
121 CTLFLAG_RD, &num_aio_procs, 0, "");
122
123 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
124 CTLFLAG_RD, &num_queue_count, 0, "");
125
126 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
127 CTLFLAG_RW, &max_queue_count, 0, "");
128
129 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
130 CTLFLAG_RW, &target_aio_procs, 0, "");
131
132 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
133 CTLFLAG_RW, &max_buf_aio, 0, "");
134
135 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
136 CTLFLAG_RD, &num_buf_aio, 0, "");
137
138 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
139 CTLFLAG_RW, &aiod_lifetime, 0, "");
140
141 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
142 CTLFLAG_RW, &aiod_timeout, 0, "");
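/*
 * These knobs appear under the vfs.aio sysctl tree.  A hedged sketch
 * of reading and then raising one of them from userland with
 * sysctlbyname(3); the particular OID and the value 64 are only an
 * example:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	static void
 *	bump_aio_limit(void)
 *	{
 *		int cur, new = 64;
 *		size_t len = sizeof(cur);
 *
 *		(void)sysctlbyname("vfs.aio.max_aio_per_proc",
 *		    &cur, &len, NULL, 0);
 *		(void)sysctlbyname("vfs.aio.max_aio_per_proc",
 *		    NULL, NULL, &new, sizeof(new));
 *	}
 */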
143
144
145 /*
146 * Job queue item
147 */
148
149 #define AIOCBLIST_CANCELLED 0x1
150 #define AIOCBLIST_RUNDOWN 0x4
151 #define AIOCBLIST_ASYNCFREE 0x8
152 #define AIOCBLIST_DONE 0x10
153
154 struct aiocblist {
155 TAILQ_ENTRY (aiocblist) list; /* List of jobs */
156 TAILQ_ENTRY (aiocblist) plist; /* List of jobs for proc */
157 int jobflags;
158 int jobstate;
159 int inputcharge, outputcharge;
160 struct buf *bp; /* buffer pointer */
161 struct proc *userproc; /* User process */
162 struct aioproclist *jobaioproc; /* AIO process descriptor */
163 struct aio_liojob *lio; /* optional lio job */
164 struct aiocb *uuaiocb; /* pointer in userspace of aiocb */
165 struct aiocb uaiocb; /* Kernel I/O control block */
166 };
167
168
169 /*
170 * AIO process info
171 */
172 #define AIOP_FREE 0x1 /* proc on free queue */
173 #define AIOP_SCHED 0x2 /* proc explicitly scheduled */
174
175 struct aioproclist {
176 int aioprocflags; /* AIO proc flags */
177 TAILQ_ENTRY(aioproclist) list; /* List of processes */
178 struct proc *aioproc; /* The AIO thread */
179 TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
180 };
181
182 /*
183 * data-structure for lio signal management
184 */
185 struct aio_liojob {
186 int lioj_flags;
187 int lioj_buffer_count;
188 int lioj_buffer_finished_count;
189 int lioj_queue_count;
190 int lioj_queue_finished_count;
191 struct sigevent lioj_signal; /* signal on all I/O done */
192 TAILQ_ENTRY (aio_liojob) lioj_list;
193 struct kaioinfo *lioj_ki;
194 };
195 #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
196 #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
197
198 /*
199 * per process aio data structure
200 */
201 struct kaioinfo {
202 int kaio_flags; /* per process kaio flags */
203 int kaio_maxactive_count; /* maximum number of AIOs */
204 int kaio_active_count; /* number of currently used AIOs */
205 int kaio_qallowed_count; /* maximum size of AIO queue */
206 int kaio_queue_count; /* size of AIO queue */
207 int kaio_ballowed_count; /* maximum number of buffers */
208 int kaio_queue_finished_count; /* number of daemon jobs finished */
209 int kaio_buffer_count; /* number of physio buffers */
210 int kaio_buffer_finished_count; /* count of I/O done */
211 struct proc *kaio_p; /* process that uses this kaio block */
212 TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
213 TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */
214 TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */
215 TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */
216 TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */
217 };
218
219 #define KAIO_RUNDOWN 0x1 /* process is being run down */
220 #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant
221 event */
222
223
224 static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
225 static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
226 static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
227 static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
228
229 static void aio_init_aioinfo(struct proc *p);
230 static void aio_onceonly(void *);
231 static int aio_free_entry(struct aiocblist *aiocbe);
232 static void aio_process(struct aiocblist *aiocbe);
233 static int aio_newproc(void);
234 static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
235 static void aio_physwakeup(struct buf *bp);
236 static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
237 static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
238 static void aio_daemon(void *uproc);
239
240 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
241
242 static vm_zone_t kaio_zone = 0, aiop_zone = 0,
243 aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;
244
245 /*
246 * Single AIOD vmspace shared amongst all of them
247 */
248 struct vmspace *aiovmspace = NULL;
249
250 /*
251 * Startup initialization
252 */
253 void
254 aio_onceonly(void *na)
255 {
256 TAILQ_INIT(&aio_freeproc);
257 TAILQ_INIT(&aio_activeproc);
258 TAILQ_INIT(&aio_jobs);
259 TAILQ_INIT(&aio_bufjobs);
260 TAILQ_INIT(&aio_freejobs);
261 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
262 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
263 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
264 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
265 aiolio_zone = zinit("AIOLIO",
266 AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
267 aiod_timeout = AIOD_TIMEOUT_DEFAULT;
268 aiod_lifetime = AIOD_LIFETIME_DEFAULT;
269 jobrefid = 1;
270 }
271
272 /*
273 * Init the per-process aioinfo structure.
274 * The aioinfo limits are set per-process for user limit (resource) management.
275 */
276 void
277 aio_init_aioinfo(struct proc *p)
278 {
279 struct kaioinfo *ki;
280 if (p->p_aioinfo == NULL) {
281 ki = zalloc(kaio_zone);
282 p->p_aioinfo = ki;
283 ki->kaio_flags = 0;
284 ki->kaio_maxactive_count = max_aio_per_proc;
285 ki->kaio_active_count = 0;
286 ki->kaio_qallowed_count = max_aio_queue_per_proc;
287 ki->kaio_queue_count = 0;
288 ki->kaio_ballowed_count = max_buf_aio;
289 ki->kaio_buffer_count = 0;
290 ki->kaio_buffer_finished_count = 0;
291 ki->kaio_p = p;
292 TAILQ_INIT(&ki->kaio_jobdone);
293 TAILQ_INIT(&ki->kaio_jobqueue);
294 TAILQ_INIT(&ki->kaio_bufdone);
295 TAILQ_INIT(&ki->kaio_bufqueue);
296 TAILQ_INIT(&ki->kaio_liojoblist);
297 }
298 }
299
300 /*
301 * Free a job entry. Wait for completion if it is currently
302 * active, but don't delay forever. If we do delay, we return
303 * a flag indicating that the queue scan must be restarted.
304 */
305 int
306 aio_free_entry(struct aiocblist *aiocbe)
307 {
308 struct kaioinfo *ki;
309 struct aioproclist *aiop;
310 struct aio_liojob *lj;
311 struct proc *p;
312 int error;
313 int s;
314
315 if (aiocbe->jobstate == JOBST_NULL)
316 panic("aio_free_entry: freeing already free job");
317
318 p = aiocbe->userproc;
319 ki = p->p_aioinfo;
320 lj = aiocbe->lio;
321 if (ki == NULL)
322 panic("aio_free_entry: missing p->p_aioinfo");
323
324 if (aiocbe->jobstate == JOBST_JOBRUNNING) {
325 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
326 return 0;
327 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
328 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
329 }
330 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
331
332 if (aiocbe->bp == NULL) {
333 if (ki->kaio_queue_count <= 0)
334 panic("aio_free_entry: process queue size <= 0");
335 if (num_queue_count <= 0)
336 panic("aio_free_entry: system wide queue size <= 0");
337
338 if(lj) {
339 lj->lioj_queue_count--;
340 if (aiocbe->jobflags & AIOCBLIST_DONE)
341 lj->lioj_queue_finished_count--;
342 }
343 ki->kaio_queue_count--;
344 if (aiocbe->jobflags & AIOCBLIST_DONE)
345 ki->kaio_queue_finished_count--;
346 num_queue_count--;
347
348 } else {
349 if(lj) {
350 lj->lioj_buffer_count--;
351 if (aiocbe->jobflags & AIOCBLIST_DONE)
352 lj->lioj_buffer_finished_count--;
353 }
354 if (aiocbe->jobflags & AIOCBLIST_DONE)
355 ki->kaio_buffer_finished_count--;
356 ki->kaio_buffer_count--;
357 num_buf_aio--;
358
359 }
360
361 if ((ki->kaio_flags & KAIO_WAKEUP) ||
362 ((ki->kaio_flags & KAIO_RUNDOWN) &&
363 (ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) {
364 ki->kaio_flags &= ~KAIO_WAKEUP;
365 wakeup(p);
366 }
367
368 if ( aiocbe->jobstate == JOBST_JOBQBUF) {
369 if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
370 return error;
371 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
372 panic("aio_free_entry: invalid physio finish-up state");
373 s = splbio();
374 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
375 splx(s);
376 } else if ( aiocbe->jobstate == JOBST_JOBQPROC) {
377 aiop = aiocbe->jobaioproc;
378 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
379 } else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) {
380 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
381 } else if ( aiocbe->jobstate == JOBST_JOBFINISHED) {
382 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
383 } else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) {
384 s = splbio();
385 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
386 splx(s);
387 if (aiocbe->bp) {
388 vunmapbuf(aiocbe->bp);
389 relpbuf(aiocbe->bp);
390 aiocbe->bp = NULL;
391 }
392 }
393 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
394 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
395 zfree(aiolio_zone, lj);
396 }
397 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
398 aiocbe->jobstate = JOBST_NULL;
399 return 0;
400 }
401
402 /*
403 * Rundown the jobs for a given process.
404 */
405 void
406 aio_proc_rundown(struct proc *p)
407 {
408 int s;
409 struct kaioinfo *ki;
410 struct aio_liojob *lj, *ljn;
411 struct aiocblist *aiocbe, *aiocbn;
412
413 ki = p->p_aioinfo;
414 if (ki == NULL)
415 return;
416
417 ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
418 while ((ki->kaio_active_count > 0) ||
419 (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
420 ki->kaio_flags |= KAIO_RUNDOWN;
421 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
422 break;
423 }
424
425 restart1:
426 for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
427 aiocbe;
428 aiocbe = aiocbn) {
429 aiocbn = TAILQ_NEXT(aiocbe, plist);
430 if (aio_free_entry(aiocbe))
431 goto restart1;
432 }
433
434 restart2:
435 for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
436 aiocbe;
437 aiocbe = aiocbn) {
438 aiocbn = TAILQ_NEXT(aiocbe, plist);
439 if (aio_free_entry(aiocbe))
440 goto restart2;
441 }
442
443 /*
444 * Note that splbio is dropped and reacquired repeatedly here, to
445 * avoid holding it across long chains of I/O. Probably unnecessary.
446 */
447
448 restart3:
449 s = splbio();
450 while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
451 ki->kaio_flags |= KAIO_WAKEUP;
452 tsleep (p, PRIBIO, "aioprn", 0);
453 splx(s);
454 goto restart3;
455 }
456 splx(s);
457
458 restart4:
459 s = splbio();
460 for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
461 aiocbe;
462 aiocbe = aiocbn) {
463 aiocbn = TAILQ_NEXT(aiocbe, plist);
464 if (aio_free_entry(aiocbe)) {
465 splx(s);
466 goto restart4;
467 }
468 }
469 splx(s);
470
471 for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist);
472 lj;
473 lj = ljn) {
474 ljn = TAILQ_NEXT(lj, lioj_list);
475 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
476 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
477 zfree(aiolio_zone, lj);
478 } else {
479 #if defined(DIAGNOSTIC)
480 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
481 lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
482 lj->lioj_queue_count, lj->lioj_queue_finished_count);
483 #endif
484 }
485 }
486
487 zfree(kaio_zone, ki);
488 p->p_aioinfo = NULL;
489 }
490
491 /*
492 * Select a job to run (called by an AIO daemon)
493 */
494 static struct aiocblist *
495 aio_selectjob(struct aioproclist *aiop)
496 {
497
498 struct aiocblist *aiocbe;
499
500 aiocbe = TAILQ_FIRST(&aiop->jobtorun);
501 if (aiocbe) {
502 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
503 return aiocbe;
504 }
505
506 for (aiocbe = TAILQ_FIRST(&aio_jobs);
507 aiocbe;
508 aiocbe = TAILQ_NEXT(aiocbe, list)) {
509 struct kaioinfo *ki;
510 struct proc *userp;
511
512 userp = aiocbe->userproc;
513 ki = userp->p_aioinfo;
514
515 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
516 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
517 return aiocbe;
518 }
519 }
520
521 return NULL;
522 }
523
524 /*
525 * The AIO processing activity. This is the code that does the
526 * I/O request for the non-physio version of the operations. The
527 * normal vn operations are used, and this code should work in
528 * all instances for every type of file, including pipes, sockets,
529 * fifos, and regular files.
530 */
531 void
532 aio_process(struct aiocblist *aiocbe)
533 {
534 struct filedesc *fdp;
535 struct proc *userp, *mycp;
536 struct aiocb *cb;
537 struct file *fp;
538 struct uio auio;
539 struct iovec aiov;
540 unsigned int fd;
541 int cnt;
542 int error;
543 off_t offset;
544 int oublock_st, oublock_end;
545 int inblock_st, inblock_end;
546
547 userp = aiocbe->userproc;
548 cb = &aiocbe->uaiocb;
549
550 mycp = curproc;
551
552 fdp = mycp->p_fd;
553 fd = cb->aio_fildes;
554 fp = fdp->fd_ofiles[fd];
555
556 aiov.iov_base = (void *) cb->aio_buf;
557 aiov.iov_len = cb->aio_nbytes;
558
559 auio.uio_iov = &aiov;
560 auio.uio_iovcnt = 1;
561 auio.uio_offset = offset = cb->aio_offset;
562 auio.uio_resid = cb->aio_nbytes;
563 cnt = cb->aio_nbytes;
564 auio.uio_segflg = UIO_USERSPACE;
565 auio.uio_procp = mycp;
566
567 inblock_st = mycp->p_stats->p_ru.ru_inblock;
568 oublock_st = mycp->p_stats->p_ru.ru_oublock;
569 if (cb->aio_lio_opcode == LIO_READ) {
570 auio.uio_rw = UIO_READ;
571 error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET);
572 } else {
573 auio.uio_rw = UIO_WRITE;
574 error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET);
575 }
576 inblock_end = mycp->p_stats->p_ru.ru_inblock;
577 oublock_end = mycp->p_stats->p_ru.ru_oublock;
578
579 aiocbe->inputcharge = inblock_end - inblock_st;
580 aiocbe->outputcharge = oublock_end - oublock_st;
581
582 if (error) {
583 if (auio.uio_resid != cnt) {
584 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
585 error = 0;
586 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
587 psignal(userp, SIGPIPE);
588 }
589 }
590
591 cnt -= auio.uio_resid;
592 cb->_aiocb_private.error = error;
593 cb->_aiocb_private.status = cnt;
594
595 return;
596
597 }
598
599 /*
600 * The AIO daemon, most of the actual work is done in aio_process,
601 * but the setup (and address space mgmt) is done in this routine.
602 */
603 static void
604 aio_daemon(void *uproc)
605 {
606 int s;
607 struct aioproclist *aiop;
608 struct vmspace *myvm, *aiovm;
609 struct proc *mycp;
610
611 /*
612 * Local copies of curproc (cp) and vmspace (myvm)
613 */
614 mycp = curproc;
615 myvm = mycp->p_vmspace;
616
617 /*
618 * We arrange to create only one VM space for all AIOD processes.
619 * The VM space for the first AIOD created becomes the shared VM
620 * space for all of them. We add an additional reference count,
621 * even for the first AIOD, so the address space does not go away,
622 * and we continue to use that original VM space even if the first
623 * AIOD exits.
624 */
625 if ((aiovm = aiovmspace) == NULL) {
626 aiovmspace = myvm;
627 myvm->vm_refcnt++;
628 /*
629 * Remove userland cruft from address space.
630 */
631 if (myvm->vm_shm)
632 shmexit(mycp);
633 pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
634 vm_map_remove(&myvm->vm_map, 0, USRSTACK);
635 myvm->vm_tsize = 0;
636 myvm->vm_dsize = 0;
637 myvm->vm_ssize = 0;
638 } else {
639 aiovm->vm_refcnt++;
640 mycp->p_vmspace = aiovm;
641 pmap_activate(mycp);
642 vmspace_free(myvm);
643 myvm = aiovm;
644 }
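/*
 * The reference counting just above keeps two invariants: the shared
 * AIOD vmspace always carries one extra reference (taken by whichever
 * daemon created it), so it survives the exit of the first AIOD; and
 * each daemon holds its own reference on that shared vmspace for its
 * whole lifetime, taking an additional transient reference on a
 * client's vmspace only while servicing that client's jobs (see the
 * job loop below).
 */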
645
646 if (mycp->p_textvp) {
647 vrele(mycp->p_textvp);
648 mycp->p_textvp = NULL;
649 }
650
651 /*
652 * Allocate and ready the aio control info. There is one
653 * aiop structure per daemon.
654 */
655 aiop = zalloc(aiop_zone);
656 aiop->aioproc = mycp;
657 aiop->aioprocflags |= AIOP_FREE;
658 TAILQ_INIT(&aiop->jobtorun);
659
660 /*
661 * Place thread (lightweight process) onto the AIO free thread list
662 */
663 if (TAILQ_EMPTY(&aio_freeproc))
664 wakeup(&aio_freeproc);
665 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
666
667 /*
668 * Make up a name for the daemon
669 */
670 strcpy(mycp->p_comm, "aiod");
671
672 /*
673 * Get rid of our current file descriptors. AIODs don't need any
674 * file descriptors, except as temporarily inherited from the client.
675 * Credentials are also cloned, and made equivalent to "root."
676 */
677 fdfree(mycp);
678 mycp->p_fd = NULL;
679 mycp->p_ucred = crcopy(mycp->p_ucred);
680 mycp->p_ucred->cr_uid = 0;
681 mycp->p_ucred->cr_ngroups = 1;
682 mycp->p_ucred->cr_groups[0] = 1;
683
684 /*
685 * The daemon resides in its own pgrp.
686 */
687 enterpgrp(mycp, mycp->p_pid, 1);
688
689 /*
690 * Mark special process type
691 */
692 mycp->p_flag |= P_SYSTEM|P_KTHREADP;
693
694 /*
695 * Wake up the parent process. (The parent sleeps to keep from
696 * creating too many daemons at once.)
697 */
698 wakeup(mycp);
699
700 while(1) {
701 struct proc *curcp;
702 struct aiocblist *aiocbe;
703
704 /*
705 * curcp is the current daemon process context.
706 * userp is the current user process context.
707 */
708 curcp = mycp;
709
710 /*
711 * Take daemon off of free queue
712 */
713 if (aiop->aioprocflags & AIOP_FREE) {
714 TAILQ_REMOVE(&aio_freeproc, aiop, list);
715 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
716 aiop->aioprocflags &= ~AIOP_FREE;
717 }
718 aiop->aioprocflags &= ~AIOP_SCHED;
719
720 /*
721 * Check for jobs
722 */
723 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
724 struct proc *userp;
725 struct aiocb *cb;
726 struct kaioinfo *ki;
727 struct aio_liojob *lj;
728
729 cb = &aiocbe->uaiocb;
730 userp = aiocbe->userproc;
731
732 aiocbe->jobstate = JOBST_JOBRUNNING;
733
734 /*
735 * Connect to process address space for user program
736 */
737 if (userp != curcp) {
738 struct vmspace *tmpvm;
739 /*
740 * Save the current address space that we are connected to.
741 */
742 tmpvm = mycp->p_vmspace;
743 /*
744 * Point to the new user address space, and refer to it.
745 */
746 mycp->p_vmspace = userp->p_vmspace;
747 mycp->p_vmspace->vm_refcnt++;
748 /*
749 * Activate the new mapping.
750 */
751 pmap_activate(mycp);
752 /*
753 * If the old address space wasn't the daemons own address
754 * space, then we need to remove the daemon's reference from
755 * the other process that it was acting on behalf of.
756 */
757 if (tmpvm != myvm) {
758 vmspace_free(tmpvm);
759 }
760 /*
761 * Disassociate from the previous client's file descriptors, and
762 * associate with the new client's descriptors. Note that
763 * the daemon doesn't need to worry about its original
764 * descriptors, because they were originally freed.
765 */
766 if (mycp->p_fd)
767 fdfree(mycp);
768 mycp->p_fd = fdshare(userp);
769 curcp = userp;
770 }
771
772 ki = userp->p_aioinfo;
773 lj = aiocbe->lio;
774
775 /*
776 * Account for currently active jobs
777 */
778 ki->kaio_active_count++;
779
780 /*
781 * Do the I/O function
782 */
783 aiocbe->jobaioproc = aiop;
784 aio_process(aiocbe);
785
786 /*
787 * decrement the active job count
788 */
789 ki->kaio_active_count--;
790
791 /*
792 * increment the completion count for wakeup/signal comparisons
793 */
794 aiocbe->jobflags |= AIOCBLIST_DONE;
795 ki->kaio_queue_finished_count++;
796 if (lj) {
797 lj->lioj_queue_finished_count++;
798 }
799 if ((ki->kaio_flags & KAIO_WAKEUP) ||
800 ((ki->kaio_flags & KAIO_RUNDOWN) &&
801 (ki->kaio_active_count == 0))) {
802 ki->kaio_flags &= ~KAIO_WAKEUP;
803 wakeup(userp);
804 }
805
806 s = splbio();
807 if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
808 LIOJ_SIGNAL) {
809 if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
810 (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
811 psignal(userp, lj->lioj_signal.sigev_signo);
812 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
813 }
814 }
815 splx(s);
816
817 aiocbe->jobstate = JOBST_JOBFINISHED;
818
819 /*
820 * If the I/O request should be automatically rundown, do the
821 * needed cleanup. Otherwise, place the queue entry for
822 * the just finished I/O request into the done queue for the
823 * associated client.
824 */
825 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
826 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
827 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
828 } else {
829 TAILQ_REMOVE(&ki->kaio_jobqueue,
830 aiocbe, plist);
831 TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
832 aiocbe, plist);
833 }
834
835 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
836 wakeup(aiocbe);
837 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
838 }
839
840 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
841 psignal(userp, cb->aio_sigevent.sigev_signo);
842 }
843 }
844
845 /*
846 * Disconnect from user address space
847 */
848 if (curcp != mycp) {
849 struct vmspace *tmpvm;
850 /*
851 * Get the user address space to disconnect from.
852 */
853 tmpvm = mycp->p_vmspace;
854 /*
855 * Get original address space for daemon.
856 */
857 mycp->p_vmspace = myvm;
858 /*
859 * Activate the daemon's address space.
860 */
861 pmap_activate(mycp);
862 #if defined(DIAGNOSTIC)
863 if (tmpvm == myvm)
864 printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
865 #endif
866 /*
867 * remove our vmspace reference.
868 */
869 vmspace_free(tmpvm);
870 /*
871 * disassociate from the user process's file descriptors.
872 */
873 if (mycp->p_fd)
874 fdfree(mycp);
875 mycp->p_fd = NULL;
876 curcp = mycp;
877 }
878
879 /*
880 * If we are the first to be put onto the free queue, wakeup
881 * anyone waiting for a daemon.
882 */
883 TAILQ_REMOVE(&aio_activeproc, aiop, list);
884 if (TAILQ_EMPTY(&aio_freeproc))
885 wakeup(&aio_freeproc);
886 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
887 aiop->aioprocflags |= AIOP_FREE;
888
889 /*
890 * If daemon is inactive for a long time, allow it to exit, thereby
891 * freeing resources.
892 */
893 if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
894 tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
895 if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
896 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
897 if ((aiop->aioprocflags & AIOP_FREE) &&
898 (num_aio_procs > target_aio_procs)) {
899 TAILQ_REMOVE(&aio_freeproc, aiop, list);
900 zfree(aiop_zone, aiop);
901 num_aio_procs--;
902 #if defined(DIAGNOSTIC)
903 if (mycp->p_vmspace->vm_refcnt <= 1)
904 printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
905 mycp->p_vmspace->vm_refcnt);
906 #endif
907 exit1(mycp, 0);
908 }
909 }
910 }
911 }
912 }
913
914 /*
915 * Create a new AIO daemon. This is mostly a kernel-thread fork routine.
916 * The AIO daemon modifies its environment itself.
917 */
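/*
 * RFPROC asks rfork(2) for a new process, and RFCFDG starts that
 * child with a clean (empty) file descriptor table, which suits a
 * daemon that only fdshare()'s a client's table on demand.
 */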
918 static int
919 aio_newproc()
920 {
921 int error;
922 struct rfork_args rfa;
923 struct proc *p, *np;
924
925 rfa.flags = RFPROC | RFCFDG;
926
927 p = curproc;
928 if ((error = rfork(p, &rfa)) != 0)
929 return error;
930
931 np = pfind(p->p_retval[0]);
932 cpu_set_fork_handler(np, aio_daemon, p);
933
934 /*
935 * Wait until daemon is started, but continue on just in case (to
936 * handle error conditions).
937 */
938 error = tsleep(np, PZERO, "aiosta", aiod_timeout);
939 num_aio_procs++;
940
941 return error;
942
943 }
944
945 /*
946 * Try the high-performance physio method for eligible VCHR devices. This
947 * routine doesn't require the use of any additional threads, and has
948 * very little overhead.
949 */
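/*
 * In outline, the eligibility tests below are: the descriptor must
 * name a VCHR vnode with a block-device major behind it (d_bmaj not
 * -1) and must not be a tty; the transfer must be a multiple of
 * DEV_BSIZE and no larger than MAXPHYS; and the process must be
 * within its physio buffer quota (kaio_ballowed_count). Any failed
 * test returns -1, which tells the caller to fall back to the
 * daemon-based path.
 */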
950 int
951 aio_qphysio(p, aiocbe)
952 struct proc *p;
953 struct aiocblist *aiocbe;
954 {
955 int error;
956 struct aiocb *cb;
957 struct file *fp;
958 struct buf *bp;
959 int bflags;
960 struct vnode *vp;
961 struct kaioinfo *ki;
962 struct filedesc *fdp;
963 struct aio_liojob *lj;
964 int fd;
965 int majordev;
966 int s;
967 int cnt;
968 dev_t dev;
969 int rw;
970 d_strategy_t *fstrategy;
971 struct cdevsw *cdev;
972 struct cdevsw *bdev;
973
974 cb = &aiocbe->uaiocb;
975 fdp = p->p_fd;
976 fd = cb->aio_fildes;
977 fp = fdp->fd_ofiles[fd];
978
979 if (fp->f_type != DTYPE_VNODE) {
980 return -1;
981 }
982
983 vp = (struct vnode *)fp->f_data;
984 if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
985 return -1;
986 }
987
988 if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
989 return -1;
990 }
991
992 if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
993 return -1;
994 }
995
996 majordev = major(vp->v_rdev);
997 if (majordev == NODEV) {
998 return -1;
999 }
1000
1001 cdev = cdevsw[major(vp->v_rdev)];
1002 if (cdev == NULL) {
1003 return -1;
1004 }
1005
1006 if (cdev->d_bmaj == -1) {
1007 return -1;
1008 }
1009 bdev = cdev;
1010
1011 ki = p->p_aioinfo;
1012 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1013 return -1;
1014 }
1015
1016 cnt = cb->aio_nbytes;
1017 if (cnt > MAXPHYS) {
1018 return -1;
1019 }
1020
1021 dev = makedev(bdev->d_bmaj, minor(vp->v_rdev));
1022
1023 /*
1024 * Physical I/O is charged directly to the process, so we don't have
1025 * to fake it.
1026 */
1027 aiocbe->inputcharge = 0;
1028 aiocbe->outputcharge = 0;
1029
1030 ki->kaio_buffer_count++;
1031
1032 lj = aiocbe->lio;
1033 if (lj) {
1034 lj->lioj_buffer_count++;
1035 }
1036
1037 /* create and build a buffer header for a transfer */
1038 bp = (struct buf *)getpbuf();
1039
1040 /*
1041 * get a copy of the kva from the physical buffer
1042 */
1043 bp->b_proc = p;
1044 bp->b_dev = dev;
1045 error = bp->b_error = 0;
1046
1047 if (cb->aio_lio_opcode == LIO_WRITE) {
1048 rw = 0;
1049 bflags = B_WRITE;
1050 } else {
1051 rw = 1;
1052 bflags = B_READ;
1053 }
1054
1055 bp->b_bcount = cb->aio_nbytes;
1056 bp->b_bufsize = cb->aio_nbytes;
1057 bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
1058 bp->b_iodone = aio_physwakeup;
1059 bp->b_saveaddr = bp->b_data;
1060 bp->b_data = (void *) cb->aio_buf;
1061 bp->b_blkno = btodb(cb->aio_offset);
1062
1063 if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
1064 error = EFAULT;
1065 goto doerror;
1066 }
1067 if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
1068 error = EFAULT;
1069 goto doerror;
1070 }
1071
1072 /* bring buffer into kernel space */
1073 vmapbuf(bp);
1074
1075 s = splbio();
1076 aiocbe->bp = bp;
1077 bp->b_spc = (void *)aiocbe;
1078 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1079 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1080 aiocbe->jobstate = JOBST_JOBQBUF;
1081 cb->_aiocb_private.status = cb->aio_nbytes;
1082 num_buf_aio++;
1083 fstrategy = bdev->d_strategy;
1084 bp->b_error = 0;
1085
1086 splx(s);
1087 /* perform transfer */
1088 (*fstrategy)(bp);
1089
1090 s = splbio();
1091 /*
1092 * If an error occurred while issuing the request, or while processing
1093 * it before returning, we treat it as an error in the transfer. Note
1094 * that such an I/O error is not indicated immediately, but is returned
1095 * using the aio_error mechanism. In this case, aio_suspend will
1096 * return immediately.
1097 */
1098 if (bp->b_error || (bp->b_flags & B_ERROR)) {
1099 struct aiocb *job = aiocbe->uuaiocb;
1100
1101 aiocbe->uaiocb._aiocb_private.status = 0;
1102 suword(&job->_aiocb_private.status, 0);
1103 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1104 suword(&job->_aiocb_private.error, bp->b_error);
1105
1106 ki->kaio_buffer_finished_count++;
1107
1108 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1109 aiocbe->jobstate = JOBST_JOBBFINISHED;
1110 aiocbe->jobflags |= AIOCBLIST_DONE;
1111 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1112 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1113 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1114 }
1115 }
1116 splx(s);
1117 return 0;
1118
1119 doerror:
1120 ki->kaio_buffer_count--;
1121 if (lj) {
1122 lj->lioj_buffer_count--;
1123 }
1124 aiocbe->bp = NULL;
1125 relpbuf(bp);
1126 return error;
1127 }
1128
1129 /*
1130 * This waits/tests physio completion.
1131 */
1132 int
1133 aio_fphysio(p, iocb, flgwait)
1134 struct proc *p;
1135 struct aiocblist *iocb;
1136 int flgwait;
1137 {
1138 int s;
1139 struct buf *bp;
1140 int error;
1141
1142 bp = iocb->bp;
1143
1144 s = splbio();
1145 if (flgwait == 0) {
1146 if ((bp->b_flags & B_DONE) == 0) {
1147 splx(s);
1148 return EINPROGRESS;
1149 }
1150 }
1151
1152 while ((bp->b_flags & B_DONE) == 0) {
1153 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1154 if ((bp->b_flags & B_DONE) == 0) {
1155 splx(s);
1156 return EINPROGRESS;
1157 } else {
1158 break;
1159 }
1160 }
1161 }
1162
1163 /* release mapping into kernel space */
1164 vunmapbuf(bp);
1165 iocb->bp = 0;
1166
1167 error = 0;
1168 /*
1169 * check for an error
1170 */
1171 if (bp->b_flags & B_ERROR) {
1172 error = bp->b_error;
1173 }
1174
1175 relpbuf(bp);
1176 return (error);
1177 }
1178
1179 /*
1180 * Queue a new AIO request. Choosing either the threaded or direct physio
1181 * VCHR technique is done in this code.
1182 */
1183 static int
1184 _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1185 {
1186 struct filedesc *fdp;
1187 struct file *fp;
1188 unsigned int fd;
1189
1190 int error;
1191 int opcode;
1192 struct aiocblist *aiocbe;
1193 struct aioproclist *aiop;
1194 struct kaioinfo *ki;
1195
1196 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
1197 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1198 } else {
1199 aiocbe = zalloc (aiocb_zone);
1200 }
1201
1202 aiocbe->inputcharge = 0;
1203 aiocbe->outputcharge = 0;
1204
1205 suword(&job->_aiocb_private.status, -1);
1206 suword(&job->_aiocb_private.error, 0);
1207 suword(&job->_aiocb_private.kernelinfo, -1);
1208
1209 error = copyin((caddr_t)job,
1210 (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
1211 if (error) {
1212 suword(&job->_aiocb_private.error, error);
1213
1214 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1215 return error;
1216 }
1217
1218 /*
1219 * Save userspace address of the job info
1220 */
1221 aiocbe->uuaiocb = job;
1222
1223 /*
1224 * Get the opcode
1225 */
1226 if (type != LIO_NOP) {
1227 aiocbe->uaiocb.aio_lio_opcode = type;
1228 }
1229 opcode = aiocbe->uaiocb.aio_lio_opcode;
1230
1231 /*
1232 * Get the fd info for process
1233 */
1234 fdp = p->p_fd;
1235
1236 /*
1237 * Range check file descriptor
1238 */
1239 fd = aiocbe->uaiocb.aio_fildes;
1240 if (fd >= fdp->fd_nfiles) {
1241 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1242 if (type == 0) {
1243 suword(&job->_aiocb_private.error, EBADF);
1244 }
1245 return EBADF;
1246 }
1247
1248 fp = fdp->fd_ofiles[fd];
1249 if ((fp == NULL) ||
1250 ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
1251 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1252 if (type == 0) {
1253 suword(&job->_aiocb_private.error, EBADF);
1254 }
1255 return EBADF;
1256 }
1257
1258 if (aiocbe->uaiocb.aio_offset == -1LL) {
1259 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1260 if (type == 0) {
1261 suword(&job->_aiocb_private.error, EINVAL);
1262 }
1263 return EINVAL;
1264 }
1265
1266 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1267 if (error) {
1268 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1269 if (type == 0) {
1270 suword(&job->_aiocb_private.error, EINVAL);
1271 }
1272 return error;
1273 }
1274
1275 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1276 if (jobrefid == LONG_MAX)
1277 jobrefid = 1;
1278 else
1279 jobrefid++;
1280
1281 if (opcode == LIO_NOP) {
1282 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1283 if (type == 0) {
1284 suword(&job->_aiocb_private.error, 0);
1285 suword(&job->_aiocb_private.status, 0);
1286 suword(&job->_aiocb_private.kernelinfo, 0);
1287 }
1288 return 0;
1289 }
1290
1291 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1292 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1293 if (type == 0) {
1294 suword(&job->_aiocb_private.status, 0);
1295 suword(&job->_aiocb_private.error, EINVAL);
1296 }
1297 return EINVAL;
1298 }
1299
1300 suword(&job->_aiocb_private.error, EINPROGRESS);
1301 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1302 aiocbe->userproc = p;
1303 aiocbe->jobflags = 0;
1304 aiocbe->lio = lj;
1305 ki = p->p_aioinfo;
1306
1307 if ((error = aio_qphysio(p, aiocbe)) == 0) {
1308 return 0;
1309 } else if (error > 0) {
1310 suword(&job->_aiocb_private.status, 0);
1311 aiocbe->uaiocb._aiocb_private.error = error;
1312 suword(&job->_aiocb_private.error, error);
1313 return error;
1314 }
1315
1316 /*
1317 * No buffer for daemon I/O
1318 */
1319 aiocbe->bp = NULL;
1320
1321 ki->kaio_queue_count++;
1322 if (lj) {
1323 lj->lioj_queue_count++;
1324 }
1325 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1326 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1327 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1328
1329 num_queue_count++;
1330 error = 0;
1331
1332 /*
1333 * If we don't have a free AIO process, and we are below our
1334 * quota, then start one. Otherwise, depend on the subsequent
1335 * I/O completions to pick up this job. If we don't successfully
1336 * create the new process (thread) due to resource issues, we
1337 * return an error for now (EAGAIN), which is likely not the
1338 * correct thing to do.
1339 */
1340 retryproc:
1341 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1342 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1343 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1344 aiop->aioprocflags &= ~AIOP_FREE;
1345 wakeup(aiop->aioproc);
1346 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1347 ((ki->kaio_active_count + num_aio_resv_start) <
1348 ki->kaio_maxactive_count)) {
1349 num_aio_resv_start++;
1350 if ((error = aio_newproc()) == 0) {
1351 num_aio_resv_start--;
1352 p->p_retval[0] = 0;
1353 goto retryproc;
1354 }
1355 num_aio_resv_start--;
1356 }
1357 return error;
1358 }
1359
1360 /*
1361 * This routine queues an AIO request, checking for quotas.
1362 */
1363 static int
1364 aio_aqueue(struct proc *p, struct aiocb *job, int type)
1365 {
1366 struct kaioinfo *ki;
1367
1368 if (p->p_aioinfo == NULL) {
1369 aio_init_aioinfo(p);
1370 }
1371
1372 if (num_queue_count >= max_queue_count)
1373 return EAGAIN;
1374
1375 ki = p->p_aioinfo;
1376 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1377 return EAGAIN;
1378
1379 return _aio_aqueue(p, job, NULL, type);
1380 }
1381
1382 /*
1383 * Support the aio_return system call; as a side effect, kernel
1384 * resources are released.
1385 */
1386 int
1387 aio_return(struct proc *p, struct aio_return_args *uap)
1388 {
1389 int s;
1390 int jobref;
1391 struct aiocblist *cb, *ncb;
1392 struct aiocb *ujob;
1393 struct kaioinfo *ki;
1394
1395 ki = p->p_aioinfo;
1396 if (ki == NULL) {
1397 return EINVAL;
1398 }
1399
1400 ujob = uap->aiocbp;
1401
1402 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1403 if (jobref == -1 || jobref == 0)
1404 return EINVAL;
1405
1406 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1407 cb;
1408 cb = TAILQ_NEXT(cb, plist)) {
1409 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1410 if (ujob == cb->uuaiocb) {
1411 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1412 } else {
1413 p->p_retval[0] = EFAULT;
1414 }
1415 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1416 curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
1417 cb->outputcharge = 0;
1418 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1419 curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
1420 cb->inputcharge = 0;
1421 }
1422 aio_free_entry(cb);
1423 return 0;
1424 }
1425 }
1426
1427 s = splbio();
1428 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1429 cb;
1430 cb = ncb) {
1431 ncb = TAILQ_NEXT(cb, plist);
1432 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1433 splx(s);
1434 if (ujob == cb->uuaiocb) {
1435 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1436 } else {
1437 p->p_retval[0] = EFAULT;
1438 }
1439 aio_free_entry(cb);
1440 return 0;
1441 }
1442 }
1443 splx(s);
1444
1445 return (EINVAL);
1446 }
1447
1448 /*
1449 * Allow a process to wake up when any of its outstanding I/O
1450 * requests completes.
1451 */
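/*
 * A sketch of the corresponding userland call, assuming a single
 * outstanding request and a five second timeout (the function and
 * timeout value are illustrative only):
 *
 *	#include <aio.h>
 *	#include <time.h>
 *
 *	static int
 *	wait_for(const struct aiocb *cb)
 *	{
 *		const struct aiocb *list[1];
 *		struct timespec ts;
 *
 *		list[0] = cb;
 *		ts.tv_sec = 5;
 *		ts.tv_nsec = 0;
 *		return (aio_suspend(list, 1, &ts));
 *	}
 */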
1452 int
1453 aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1454 {
1455 struct timeval atv;
1456 struct timespec ts;
1457 struct aiocb *const *cbptr, *cbp;
1458 struct kaioinfo *ki;
1459 struct aiocblist *cb;
1460 int i;
1461 int njoblist;
1462 int error, s, timo;
1463 int *ijoblist;
1464 struct aiocb **ujoblist;
1465
1466 if (uap->nent > AIO_LISTIO_MAX)
1467 return EINVAL;
1468
1469 timo = 0;
1470 if (uap->timeout) {
1471 /*
1472 * Get timespec struct
1473 */
1474 if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
1475 return error;
1476 }
1477
1478 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1479 return (EINVAL);
1480
1481 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1482 if (itimerfix(&atv))
1483 return (EINVAL);
1484 timo = tvtohz(&atv);
1485 }
1486
1487 ki = p->p_aioinfo;
1488 if (ki == NULL)
1489 return EAGAIN;
1490
1491 njoblist = 0;
1492 ijoblist = zalloc(aiol_zone);
1493 ujoblist = zalloc(aiol_zone);
1494 cbptr = uap->aiocbp;
1495
1496 for(i = 0; i < uap->nent; i++) {
1497 cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
1498 if (cbp == 0)
1499 continue;
1500 ujoblist[njoblist] = cbp;
1501 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1502 njoblist++;
1503 }
1504 if (njoblist == 0) {
1505 zfree(aiol_zone, ijoblist);
1506 zfree(aiol_zone, ujoblist);
1507 return 0;
1508 }
1509
1510 error = 0;
1511 while (1) {
1512 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1513 cb; cb = TAILQ_NEXT(cb, plist)) {
1514 for(i = 0; i < njoblist; i++) {
1515 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1516 ijoblist[i]) {
1517 if (ujoblist[i] != cb->uuaiocb)
1518 error = EINVAL;
1519 zfree(aiol_zone, ijoblist);
1520 zfree(aiol_zone, ujoblist);
1521 return error;
1522 }
1523 }
1524 }
1525
1526 s = splbio();
1527 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1528 cb; cb = TAILQ_NEXT(cb, plist)) {
1529 for(i = 0; i < njoblist; i++) {
1530 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1531 ijoblist[i]) {
1532 splx(s);
1533 if (ujoblist[i] != cb->uuaiocb)
1534 error = EINVAL;
1535 zfree(aiol_zone, ijoblist);
1536 zfree(aiol_zone, ujoblist);
1537 return error;
1538 }
1539 }
1540 }
1541
1542 ki->kaio_flags |= KAIO_WAKEUP;
1543 error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
1544 splx(s);
1545
1546 if (error == EINTR) {
1547 zfree(aiol_zone, ijoblist);
1548 zfree(aiol_zone, ujoblist);
1549 return EINTR;
1550 } else if (error == EWOULDBLOCK) {
1551 zfree(aiol_zone, ijoblist);
1552 zfree(aiol_zone, ujoblist);
1553 return EAGAIN;
1554 }
1555 }
1556
1557 /* NOTREACHED */
1558 return EINVAL;
1559 }
1560
1561 /*
1562 * aio_cancel at the kernel level is a NOOP right now. It
1563 * might be possible to support it partially in user mode, or
1564 * in kernel mode later on.
1565 */
1566 int
1567 aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1568 {
1569 return ENOSYS;
1570 }
1571
1572 /*
1573 * aio_error is implemented at the kernel level for compatibility
1574 * purposes only. For a user mode async implementation, it would be
1575 * best to do it in a userland subroutine.
1576 */
1577 int
1578 aio_error(struct proc *p, struct aio_error_args *uap)
1579 {
1580 int s;
1581 struct aiocblist *cb;
1582 struct kaioinfo *ki;
1583 int jobref;
1584
1585 ki = p->p_aioinfo;
1586 if (ki == NULL)
1587 return EINVAL;
1588
1589 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1590 if ((jobref == -1) || (jobref == 0))
1591 return EINVAL;
1592
1593 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1594 cb;
1595 cb = TAILQ_NEXT(cb, plist)) {
1596
1597 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1598 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1599 return 0;
1600 }
1601 }
1602
1603 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
1604 cb;
1605 cb = TAILQ_NEXT(cb, plist)) {
1606
1607 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1608 p->p_retval[0] = EINPROGRESS;
1609 return 0;
1610 }
1611 }
1612
1613 s = splbio();
1614 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1615 cb;
1616 cb = TAILQ_NEXT(cb, plist)) {
1617 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1618 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1619 splx(s);
1620 return 0;
1621 }
1622 }
1623
1624 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
1625 cb;
1626 cb = TAILQ_NEXT(cb, plist)) {
1627 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1628 p->p_retval[0] = EINPROGRESS;
1629 splx(s);
1630 return 0;
1631 }
1632 }
1633 splx(s);
1634
1635
1636 /*
1637 * Hack for lio
1638 */
1639 /*
1640 status = fuword(&uap->aiocbp->_aiocb_private.status);
1641 if (status == -1) {
1642 return fuword(&uap->aiocbp->_aiocb_private.error);
1643 }
1644 */
1645 return EINVAL;
1646 }
1647
1648 int
1649 aio_read(struct proc *p, struct aio_read_args *uap)
1650 {
1651 struct filedesc *fdp;
1652 struct file *fp;
1653 struct uio auio;
1654 struct iovec aiov;
1655 unsigned int fd;
1656 int cnt;
1657 struct aiocb iocb;
1658 int error, pmodes;
1659
1660 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1661 if ((pmodes & AIO_PMODE_SYNC) == 0) {
1662 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1663 }
1664
1665 /*
1666 * Get control block
1667 */
1668 if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1669 return error;
1670
1671 /*
1672 * Get the fd info for process
1673 */
1674 fdp = p->p_fd;
1675
1676 /*
1677 * Range check file descriptor
1678 */
1679 fd = iocb.aio_fildes;
1680 if (fd >= fdp->fd_nfiles)
1681 return EBADF;
1682 fp = fdp->fd_ofiles[fd];
1683 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1684 return EBADF;
1685 if (iocb.aio_offset == -1LL)
1686 return EINVAL;
1687
1688 auio.uio_resid = iocb.aio_nbytes;
1689 if (auio.uio_resid < 0)
1690 return (EINVAL);
1691
1692 /*
1693 * Process sync simply -- queue async request.
1694 */
1695 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
1696 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1697 }
1698
1699 aiov.iov_base = (void *) iocb.aio_buf;
1700 aiov.iov_len = iocb.aio_nbytes;
1701
1702 auio.uio_iov = &aiov;
1703 auio.uio_iovcnt = 1;
1704 auio.uio_offset = iocb.aio_offset;
1705 auio.uio_rw = UIO_READ;
1706 auio.uio_segflg = UIO_USERSPACE;
1707 auio.uio_procp = p;
1708
1709 cnt = iocb.aio_nbytes;
1710 error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET);
1711 if (error &&
1712 (auio.uio_resid != cnt) &&
1713 (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
1714 error = 0;
1715 cnt -= auio.uio_resid;
1716 p->p_retval[0] = cnt;
1717 return error;
1718 }
1719
1720 int
1721 aio_write(struct proc *p, struct aio_write_args *uap)
1722 {
1723 struct filedesc *fdp;
1724 struct file *fp;
1725 struct uio auio;
1726 struct iovec aiov;
1727 unsigned int fd;
1728 int cnt;
1729 struct aiocb iocb;
1730 int error;
1731 int pmodes;
1732
1733 /*
1734 * Process sync simply -- queue async request.
1735 */
1736 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1737 if ((pmodes & AIO_PMODE_SYNC) == 0) {
1738 return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
1739 }
1740
1741 if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1742 return error;
1743
1744 /*
1745 * Get the fd info for process
1746 */
1747 fdp = p->p_fd;
1748
1749 /*
1750 * Range check file descriptor
1751 */
1752 fd = iocb.aio_fildes;
1753 if (fd >= fdp->fd_nfiles)
1754 return EBADF;
1755 fp = fdp->fd_ofiles[fd];
1756 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1757 return EBADF;
1758 if (iocb.aio_offset == -1LL)
1759 return EINVAL;
1760
1761 aiov.iov_base = (void *) iocb.aio_buf;
1762 aiov.iov_len = iocb.aio_nbytes;
1763 auio.uio_iov = &aiov;
1764 auio.uio_iovcnt = 1;
1765 auio.uio_offset = iocb.aio_offset;
1766
1767 auio.uio_resid = iocb.aio_nbytes;
1768 if (auio.uio_resid < 0)
1769 return (EINVAL);
1770
1771 auio.uio_rw = UIO_WRITE;
1772 auio.uio_segflg = UIO_USERSPACE;
1773 auio.uio_procp = p;
1774
1775 cnt = iocb.aio_nbytes;
1776 error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET);
1777 if (error) {
1778 if (auio.uio_resid != cnt) {
1779 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
1780 error = 0;
1781 if (error == EPIPE)
1782 psignal(p, SIGPIPE);
1783 }
1784 }
1785 cnt -= auio.uio_resid;
1786 p->p_retval[0] = cnt;
1787 return error;
1788 }
1789
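/*
 * Queue a list of I/O requests as a unit (POSIX lio_listio). A sketch
 * of the userland side, assuming two control blocks whose descriptors
 * and buffers are already set up ("rd" and "wr" are illustrative
 * names, and error handling is minimal):
 *
 *	#include <stddef.h>
 *	#include <aio.h>
 *
 *	static int
 *	read_and_write(struct aiocb *rd, struct aiocb *wr)
 *	{
 *		struct aiocb *list[2];
 *
 *		rd->aio_lio_opcode = LIO_READ;
 *		wr->aio_lio_opcode = LIO_WRITE;
 *		list[0] = rd;
 *		list[1] = wr;
 *		if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
 *			return (-1);
 *		(void)aio_return(rd);
 *		(void)aio_return(wr);
 *		return (0);
 *	}
 */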
1790 int
1791 lio_listio(struct proc *p, struct lio_listio_args *uap)
1792 {
1793 int nent, nentqueued;
1794 struct aiocb *iocb, * const *cbptr;
1795 struct aiocblist *cb;
1796 struct kaioinfo *ki;
1797 struct aio_liojob *lj;
1798 int error, runningcode;
1799 int nerror;
1800 int i;
1801 int s;
1802
1803 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
1804 return EINVAL;
1805 }
1806
1807 nent = uap->nent;
1808 if (nent > AIO_LISTIO_MAX) {
1809 return EINVAL;
1810 }
1811
1812 if (p->p_aioinfo == NULL) {
1813 aio_init_aioinfo(p);
1814 }
1815
1816 if ((nent + num_queue_count) > max_queue_count) {
1817 return EAGAIN;
1818 }
1819
1820 ki = p->p_aioinfo;
1821 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
1822 return EAGAIN;
1823 }
1824
1825 lj = zalloc(aiolio_zone);
1826 if (!lj) {
1827 return EAGAIN;
1828 }
1829
1830 lj->lioj_flags = 0;
1831 lj->lioj_buffer_count = 0;
1832 lj->lioj_buffer_finished_count = 0;
1833 lj->lioj_queue_count = 0;
1834 lj->lioj_queue_finished_count = 0;
1835 lj->lioj_ki = ki;
1836 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1837
1838 /*
1839 * Setup signal
1840 */
1841 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1842 error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
1843 if (error)
1844 return error;
1845 lj->lioj_flags |= LIOJ_SIGNAL;
1846 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1847 } else {
1848 lj->lioj_flags &= ~LIOJ_SIGNAL;
1849 }
1850
1851 /*
1852 * get pointers to the list of I/O requests
1853 */
1854
1855 nerror = 0;
1856 nentqueued = 0;
1857 cbptr = uap->acb_list;
1858 for(i = 0; i < uap->nent; i++) {
1859 iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
1860 if (((intptr_t) iocb != -1) && ((intptr_t) iocb != 0)) {
1861 error = _aio_aqueue(p, iocb, lj, 0);
1862 if (error == 0) {
1863 nentqueued++;
1864 } else {
1865 nerror++;
1866 }
1867 }
1868 }
1869
1870 /*
1871 * If we haven't queued any, then just return; there is nothing to wait for
1872 */
1873 if (nentqueued == 0) {
1874 return 0;
1875 }
1876
1877 /*
1878 * Calculate the appropriate error return
1879 */
1880 runningcode = 0;
1881 if (nerror)
1882 runningcode = EIO;
1883
1884 if (uap->mode == LIO_WAIT) {
1885 while (1) {
1886 int found;
1887 found = 0;
1888 for(i = 0; i < uap->nent; i++) {
1889 int jobref, command;
1890
1891 /*
1892 * Fetch address of the control buf pointer in user space
1893 */
1894 iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]);
1895 if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0))
1896 continue;
1897
1898 /*
1899 * Fetch the associated command from user space
1900 */
1901 command = fuword(&iocb->aio_lio_opcode);
1902 if (command == LIO_NOP) {
1903 found++;
1904 continue;
1905 }
1906
1907 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
1908
1909 for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1910 cb;
1911 cb = TAILQ_NEXT(cb, plist)) {
1912 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1913 jobref) {
1914 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1915 curproc->p_stats->p_ru.ru_oublock +=
1916 cb->outputcharge;
1917 cb->outputcharge = 0;
1918 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1919 curproc->p_stats->p_ru.ru_inblock +=
1920 cb->inputcharge;
1921 cb->inputcharge = 0;
1922 }
1923 found++;
1924 break;
1925 }
1926 }
1927
1928 s = splbio();
1929 for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1930 cb;
1931 cb = TAILQ_NEXT(cb, plist)) {
1932 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1933 jobref) {
1934 found++;
1935 break;
1936 }
1937 }
1938 splx(s);
1939
1940 }
1941
1942 /*
1943 * If all I/Os have been disposed of, then we can return
1944 */
1945 if (found == nentqueued) {
1946 return runningcode;
1947 }
1948
1949 ki->kaio_flags |= KAIO_WAKEUP;
1950 error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
1951
1952 if (error == EINTR) {
1953 return EINTR;
1954 } else if (error == EWOULDBLOCK) {
1955 return EAGAIN;
1956 }
1957
1958 }
1959 }
1960
1961 return runningcode;
1962 }
1963
1964 /*
1965 * This is a weird hack so that we can post a signal. It is safe
1966 * to do so from a timeout routine, but *not* from an interrupt routine.
1967 */
1968 static void
1969 process_signal(void *ljarg)
1970 {
1971 struct aio_liojob *lj = ljarg;
1972 if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
1973 if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
1974 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
1975 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1976 }
1977 }
1978 }
1979
1980 /*
1981 * Interrupt handler for physio, performs the necessary process wakeups,
1982 * and signals.
1983 */
1984 static void
1985 aio_physwakeup(bp)
1986 struct buf *bp;
1987 {
1988 struct aiocblist *aiocbe;
1989 struct proc *p;
1990 struct kaioinfo *ki;
1991 struct aio_liojob *lj;
1992 int s;
1993 s = splbio();
1994
1995 wakeup((caddr_t) bp);
1996 bp->b_flags &= ~B_CALL;
1997 bp->b_flags |= B_DONE;
1998
1999 aiocbe = (struct aiocblist *)bp->b_spc;
2000 if (aiocbe) {
2001 p = bp->b_proc;
2002
2003 aiocbe->jobstate = JOBST_JOBBFINISHED;
2004 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2005 aiocbe->uaiocb._aiocb_private.error = 0;
2006 aiocbe->jobflags |= AIOCBLIST_DONE;
2007
2008 if (bp->b_flags & B_ERROR) {
2009 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2010 }
2011
2012 lj = aiocbe->lio;
2013 if (lj) {
2014 lj->lioj_buffer_finished_count++;
2015 /*
2016 * wakeup/signal if all of the interrupt jobs are done
2017 */
2018 if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
2019 /*
2020 * post a signal if it is called for
2021 */
2022 if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2023 LIOJ_SIGNAL) {
2024 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2025 timeout(process_signal, lj, 0);
2026 }
2027 }
2028 }
2029
2030 ki = p->p_aioinfo;
2031 if (ki) {
2032 ki->kaio_buffer_finished_count++;
2033 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2034 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2035 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2036 /*
2037 * and do the wakeup
2038 */
2039 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2040 ki->kaio_flags &= ~KAIO_WAKEUP;
2041 wakeup(p);
2042 }
2043 }
2044 }
2045 splx(s);
2046 }