FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_subr.c
1 /*
2 * Copyright (c) 1982, 1986, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
39 * $FreeBSD: releng/5.0/sys/kern/kern_subr.c 120697 2003-10-03 16:57:38Z nectar $
40 */
41
42 #include "opt_zero.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/ktr.h>
48 #include <sys/lock.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/malloc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/sched.h>
54 #include <sys/sysctl.h>
55 #include <sys/vnode.h>
56 #include <machine/limits.h>
57
58 #include <vm/vm.h>
59 #include <vm/vm_page.h>
60 #include <vm/vm_map.h>
61 #ifdef ZERO_COPY_SOCKETS
62 #include <vm/vm_param.h>
63 #endif
64 #if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
65 #include <vm/vm_object.h>
66 #endif
67
68 SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
69 "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
70
71 #if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
72 static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio,
73 struct vm_object *obj, int disposable);
74 #endif
75
76 #ifdef ZERO_COPY_SOCKETS
77 /* Declared in uipc_socket.c */
78 extern int so_zero_copy_receive;
79
80 static int vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
81 vm_offset_t uaddr);
82
83 static int
84 vm_pgmoveco(mapa, srcobj, kaddr, uaddr)
85 vm_map_t mapa;
86 vm_object_t srcobj;
87 vm_offset_t kaddr, uaddr;
88 {
89 vm_map_t map = mapa;
90 vm_page_t kern_pg, user_pg;
91 vm_object_t uobject;
92 vm_map_entry_t entry;
93 vm_pindex_t upindex, kpindex;
94 vm_prot_t prot;
95 boolean_t wired;
96
97 /*
98 * First lookup the kernel page.
99 */
100 kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
101
102 if ((vm_map_lookup(&map, uaddr,
103 VM_PROT_READ, &entry, &uobject,
104 &upindex, &prot, &wired)) != KERN_SUCCESS) {
105 return(EFAULT);
106 }
107 if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
108 do
109 vm_page_lock_queues();
110 while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
111 vm_page_busy(user_pg);
112 pmap_remove_all(user_pg);
113 vm_page_free(user_pg);
114 vm_page_unlock_queues();
115 }
116
117 if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
118 (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
119 printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
120 "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
121 kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
122 kern_pg->hold_count, (u_long)kern_pg->phys_addr);
123 if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
124 panic("vm_pgmoveco: renaming free page");
125 else
126 panic("vm_pgmoveco: renaming busy page");
127 }
128 kpindex = kern_pg->pindex;
129 vm_page_busy(kern_pg);
130 vm_page_rename(kern_pg, uobject, upindex);
131 vm_page_flag_clear(kern_pg, PG_BUSY);
132 kern_pg->valid = VM_PAGE_BITS_ALL;
133
134 vm_map_lookup_done(map, entry);
135 return(KERN_SUCCESS);
136 }
137 #endif /* ZERO_COPY_SOCKETS */
138
139 int
140 uiomove(cp, n, uio)
141 register caddr_t cp;
142 register int n;
143 register struct uio *uio;
144 {
145 struct thread *td = curthread;
146 register struct iovec *iov;
147 u_int cnt;
148 int error = 0;
149 int save = 0;
150
151 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
152 ("uiomove: mode"));
153 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
154 ("uiomove proc"));
155
156 if (td) {
157 mtx_lock_spin(&sched_lock);
158 save = td->td_flags & TDF_DEADLKTREAT;
159 td->td_flags |= TDF_DEADLKTREAT;
160 mtx_unlock_spin(&sched_lock);
161 }
162
163 while (n > 0 && uio->uio_resid) {
164 iov = uio->uio_iov;
165 cnt = iov->iov_len;
166 if (cnt == 0) {
167 uio->uio_iov++;
168 uio->uio_iovcnt--;
169 continue;
170 }
171 if (cnt > n)
172 cnt = n;
173
174 switch (uio->uio_segflg) {
175
176 case UIO_USERSPACE:
177 if (ticks - PCPU_GET(switchticks) >= hogticks)
178 uio_yield();
179 if (uio->uio_rw == UIO_READ)
180 error = copyout(cp, iov->iov_base, cnt);
181 else
182 error = copyin(iov->iov_base, cp, cnt);
183 if (error)
184 goto out;
185 break;
186
187 case UIO_SYSSPACE:
188 if (uio->uio_rw == UIO_READ)
189 bcopy(cp, iov->iov_base, cnt);
190 else
191 bcopy(iov->iov_base, cp, cnt);
192 break;
193 case UIO_NOCOPY:
194 break;
195 }
196 iov->iov_base = (char *)iov->iov_base + cnt;
197 iov->iov_len -= cnt;
198 uio->uio_resid -= cnt;
199 uio->uio_offset += cnt;
200 cp += cnt;
201 n -= cnt;
202 }
203 out:
204 if (td != curthread) printf("uiomove: IT CHANGED!");
205 td = curthread; /* Might things have changed in copyin/copyout? */
206 if (td) {
207 mtx_lock_spin(&sched_lock);
208 td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
209 mtx_unlock_spin(&sched_lock);
210 }
211 return (error);
212 }
213
214 /*
215 * Wrapper for uiomove() that validates the arguments against a known-good
216 * kernel buffer. Currently, uiomove accepts a signed (n) argument, which
217 * is almost definitely a bad thing, so we catch that here as well. We
218 * return a runtime failure, but it might be desirable to generate a runtime
219 * assertion failure instead.
220 */
221 int
222 uiomove_frombuf(void *buf, int buflen, struct uio *uio)
223 {
224 unsigned int offset, n;
225
226 if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
227 (offset = uio->uio_offset) != uio->uio_offset)
228 return (EINVAL);
229 if (buflen <= 0 || offset >= buflen)
230 return (0);
231 if ((n = buflen - offset) > INT_MAX)
232 return (EINVAL);
233 return (uiomove((char *)buf + offset, n, uio));
234 }
235
236 #if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS)
237 /*
238 * Experimental support for zero-copy I/O
239 */
240 static int
241 userspaceco(cp, cnt, uio, obj, disposable)
242 caddr_t cp;
243 u_int cnt;
244 struct uio *uio;
245 struct vm_object *obj;
246 int disposable;
247 {
248 struct iovec *iov;
249 int error;
250
251 iov = uio->uio_iov;
252
253 #ifdef ZERO_COPY_SOCKETS
254
255 if (uio->uio_rw == UIO_READ) {
256 if ((so_zero_copy_receive != 0)
257 && (obj != NULL)
258 && ((cnt & PAGE_MASK) == 0)
259 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
260 && ((uio->uio_offset & PAGE_MASK) == 0)
261 && ((((intptr_t) cp) & PAGE_MASK) == 0)
262 && (obj->type == OBJT_DEFAULT)
263 && (disposable != 0)) {
264 /* SOCKET: use page-trading */
265 /*
266 * We only want to call vm_pgmoveco() on
267 * disposeable pages, since it gives the
268 * kernel page to the userland process.
269 */
270 error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
271 obj, (vm_offset_t)cp,
272 (vm_offset_t)iov->iov_base);
273
274 /*
275 * If we get an error back, attempt
276 * to use copyout() instead. The
277 * disposable page should be freed
278 * automatically if we weren't able to move
279 * it into userland.
280 */
281 if (error != 0)
282 error = copyout(cp, iov->iov_base, cnt);
283 #ifdef ENABLE_VFS_IOOPT
284 } else if ((vfs_ioopt != 0)
285 && ((cnt & PAGE_MASK) == 0)
286 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
287 && ((uio->uio_offset & PAGE_MASK) == 0)
288 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
289 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
290 uio->uio_offset, cnt,
291 (vm_offset_t) iov->iov_base, NULL);
292 #endif /* ENABLE_VFS_IOOPT */
293 } else {
294 error = copyout(cp, iov->iov_base, cnt);
295 }
296 } else {
297 error = copyin(iov->iov_base, cp, cnt);
298 }
299 #else /* ZERO_COPY_SOCKETS */
300 if (uio->uio_rw == UIO_READ) {
301 #ifdef ENABLE_VFS_IOOPT
302 if ((vfs_ioopt != 0)
303 && ((cnt & PAGE_MASK) == 0)
304 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
305 && ((uio->uio_offset & PAGE_MASK) == 0)
306 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
307 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
308 uio->uio_offset, cnt,
309 (vm_offset_t) iov->iov_base, NULL);
310 } else
311 #endif /* ENABLE_VFS_IOOPT */
312 {
313 error = copyout(cp, iov->iov_base, cnt);
314 }
315 } else {
316 error = copyin(iov->iov_base, cp, cnt);
317 }
318 #endif /* ZERO_COPY_SOCKETS */
319
320 return (error);
321 }
322
323 int
324 uiomoveco(cp, n, uio, obj, disposable)
325 caddr_t cp;
326 int n;
327 struct uio *uio;
328 struct vm_object *obj;
329 int disposable;
330 {
331 struct iovec *iov;
332 u_int cnt;
333 int error;
334
335 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
336 ("uiomoveco: mode"));
337 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
338 ("uiomoveco proc"));
339
340 while (n > 0 && uio->uio_resid) {
341 iov = uio->uio_iov;
342 cnt = iov->iov_len;
343 if (cnt == 0) {
344 uio->uio_iov++;
345 uio->uio_iovcnt--;
346 continue;
347 }
348 if (cnt > n)
349 cnt = n;
350
351 switch (uio->uio_segflg) {
352
353 case UIO_USERSPACE:
354 if (ticks - PCPU_GET(switchticks) >= hogticks)
355 uio_yield();
356
357 error = userspaceco(cp, cnt, uio, obj, disposable);
358
359 if (error)
360 return (error);
361 break;
362
363 case UIO_SYSSPACE:
364 if (uio->uio_rw == UIO_READ)
365 bcopy(cp, iov->iov_base, cnt);
366 else
367 bcopy(iov->iov_base, cp, cnt);
368 break;
369 case UIO_NOCOPY:
370 break;
371 }
372 iov->iov_base = (char *)iov->iov_base + cnt;
373 iov->iov_len -= cnt;
374 uio->uio_resid -= cnt;
375 uio->uio_offset += cnt;
376 cp += cnt;
377 n -= cnt;
378 }
379 return (0);
380 }
381 #endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */
382
383 #ifdef ENABLE_VFS_IOOPT
384
385 /*
386 * Experimental support for zero-copy I/O
387 */
388 int
389 uioread(n, uio, obj, nread)
390 int n;
391 struct uio *uio;
392 struct vm_object *obj;
393 int *nread;
394 {
395 int npagesmoved;
396 struct iovec *iov;
397 u_int cnt, tcnt;
398 int error;
399
400 *nread = 0;
401 if (vfs_ioopt < 2)
402 return 0;
403
404 error = 0;
405
406 while (n > 0 && uio->uio_resid) {
407 iov = uio->uio_iov;
408 cnt = iov->iov_len;
409 if (cnt == 0) {
410 uio->uio_iov++;
411 uio->uio_iovcnt--;
412 continue;
413 }
414 if (cnt > n)
415 cnt = n;
416
417 if ((uio->uio_segflg == UIO_USERSPACE) &&
418 ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
419 ((uio->uio_offset & PAGE_MASK) == 0) ) {
420
421 if (cnt < PAGE_SIZE)
422 break;
423
424 cnt &= ~PAGE_MASK;
425
426 if (ticks - PCPU_GET(switchticks) >= hogticks)
427 uio_yield();
428 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
429 uio->uio_offset, cnt,
430 (vm_offset_t) iov->iov_base, &npagesmoved);
431
432 if (npagesmoved == 0)
433 break;
434
435 tcnt = npagesmoved * PAGE_SIZE;
436 cnt = tcnt;
437
438 if (error)
439 break;
440
441 iov->iov_base = (char *)iov->iov_base + cnt;
442 iov->iov_len -= cnt;
443 uio->uio_resid -= cnt;
444 uio->uio_offset += cnt;
445 *nread += cnt;
446 n -= cnt;
447 } else {
448 break;
449 }
450 }
451 return error;
452 }
453 #endif /* ENABLE_VFS_IOOPT */
454
455 /*
456 * Give next character to user as result of read.
457 */
458 int
459 ureadc(c, uio)
460 register int c;
461 register struct uio *uio;
462 {
463 register struct iovec *iov;
464 register char *iov_base;
465
466 again:
467 if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
468 panic("ureadc");
469 iov = uio->uio_iov;
470 if (iov->iov_len == 0) {
471 uio->uio_iovcnt--;
472 uio->uio_iov++;
473 goto again;
474 }
475 switch (uio->uio_segflg) {
476
477 case UIO_USERSPACE:
478 if (subyte(iov->iov_base, c) < 0)
479 return (EFAULT);
480 break;
481
482 case UIO_SYSSPACE:
483 iov_base = iov->iov_base;
484 *iov_base = c;
485 iov->iov_base = iov_base;
486 break;
487
488 case UIO_NOCOPY:
489 break;
490 }
491 iov->iov_base = (char *)iov->iov_base + 1;
492 iov->iov_len--;
493 uio->uio_resid--;
494 uio->uio_offset++;
495 return (0);
496 }
497
498 /*
499 * General routine to allocate a hash table.
500 */
501 void *
502 hashinit(elements, type, hashmask)
503 int elements;
504 struct malloc_type *type;
505 u_long *hashmask;
506 {
507 long hashsize;
508 LIST_HEAD(generic, generic) *hashtbl;
509 int i;
510
511 if (elements <= 0)
512 panic("hashinit: bad elements");
513 for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
514 continue;
515 hashsize >>= 1;
516 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
517 for (i = 0; i < hashsize; i++)
518 LIST_INIT(&hashtbl[i]);
519 *hashmask = hashsize - 1;
520 return (hashtbl);
521 }
522
523 void
524 hashdestroy(vhashtbl, type, hashmask)
525 void *vhashtbl;
526 struct malloc_type *type;
527 u_long hashmask;
528 {
529 LIST_HEAD(generic, generic) *hashtbl, *hp;
530
531 hashtbl = vhashtbl;
532 for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
533 if (!LIST_EMPTY(hp))
534 panic("hashdestroy: hash not empty");
535 free(hashtbl, type);
536 }
537
/*
 * Candidate table sizes for phashinit(), in ascending order.  The table
 * is only ever read, so it is const.
 */
static const int primes[] = {
	1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
	2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
	7159, 7673, 8191, 12281, 16381, 24571, 32749
};
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))
542
543 /*
544 * General routine to allocate a prime number sized hash table.
545 */
546 void *
547 phashinit(elements, type, nentries)
548 int elements;
549 struct malloc_type *type;
550 u_long *nentries;
551 {
552 long hashsize;
553 LIST_HEAD(generic, generic) *hashtbl;
554 int i;
555
556 if (elements <= 0)
557 panic("phashinit: bad elements");
558 for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
559 i++;
560 if (i == NPRIMES)
561 break;
562 hashsize = primes[i];
563 }
564 hashsize = primes[i - 1];
565 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
566 for (i = 0; i < hashsize; i++)
567 LIST_INIT(&hashtbl[i]);
568 *nentries = hashsize;
569 return (hashtbl);
570 }
571
572 void
573 uio_yield()
574 {
575 struct thread *td;
576
577 td = curthread;
578 mtx_lock_spin(&sched_lock);
579 DROP_GIANT();
580 sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
581 td->td_proc->p_stats->p_ru.ru_nivcsw++;
582 mi_switch();
583 mtx_unlock_spin(&sched_lock);
584 PICKUP_GIANT();
585 }
586
587 int
588 copyinfrom(const void *src, void *dst, size_t len, int seg)
589 {
590 int error = 0;
591
592 switch (seg) {
593 case UIO_USERSPACE:
594 error = copyin(src, dst, len);
595 break;
596 case UIO_SYSSPACE:
597 bcopy(src, dst, len);
598 break;
599 default:
600 panic("copyinfrom: bad seg %d\n", seg);
601 }
602 return (error);
603 }
604
605 int
606 copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg)
607 {
608 int error = 0;
609
610 switch (seg) {
611 case UIO_USERSPACE:
612 error = copyinstr(src, dst, len, copied);
613 break;
614 case UIO_SYSSPACE:
615 error = copystr(src, dst, len, copied);
616 break;
617 default:
618 panic("copyinstrfrom: bad seg %d\n", seg);
619 }
620 return (error);
621 }
Cache object: cd61b4d78a01ceeb6ecf9ddc04fa1489
|