FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c
1 /*-
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD$
10 *
11 */
12
13 /*-
14 * The following functions are based in the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 * The Regents of the University of California. All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 4. Neither the name of the University nor the names of its contributors
35 * may be used to endorse or promote products derived from this software
36 * without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 *
50 * from: Utah Hdr: vn.c 1.13 94/04/02
51 *
52 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94
53 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
54 */
55
56 #include "opt_geom.h"
57 #include "opt_md.h"
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bio.h>
62 #include <sys/conf.h>
63 #include <sys/fcntl.h>
64 #include <sys/kernel.h>
65 #include <sys/kthread.h>
66 #include <sys/linker.h>
67 #include <sys/lock.h>
68 #include <sys/malloc.h>
69 #include <sys/mdioctl.h>
70 #include <sys/mutex.h>
71 #include <sys/namei.h>
72 #include <sys/proc.h>
73 #include <sys/queue.h>
74 #include <sys/sf_buf.h>
75 #include <sys/sysctl.h>
76 #include <sys/vnode.h>
77
78 #include <geom/geom.h>
79
80 #include <vm/vm.h>
81 #include <vm/vm_object.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pager.h>
84 #include <vm/swap_pager.h>
85 #include <vm/uma.h>
86
87 #define MD_MODVER 1
88
89 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */
90
91 #ifndef MD_NSECT
92 #define MD_NSECT (10000 * 2)
93 #endif
94
95 static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
96 static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
97
98 static int md_debug;
99 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
100
101 #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
102 /* Image gets put here: */
103 static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
104 static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
105 #endif
106
107 static g_init_t g_md_init;
108 static g_fini_t g_md_fini;
109 static g_start_t g_md_start;
110 static g_access_t g_md_access;
111
112 static int mdunits;
113 static struct cdev *status_dev = 0;
114
115 static d_ioctl_t mdctlioctl;
116
117 static struct cdevsw mdctl_cdevsw = {
118 .d_version = D_VERSION,
119 .d_flags = D_NEEDGIANT,
120 .d_ioctl = mdctlioctl,
121 .d_name = MD_NAME,
122 };
123
124 struct g_class g_md_class = {
125 .name = "MD",
126 .version = G_VERSION,
127 .init = g_md_init,
128 .fini = g_md_fini,
129 .start = g_md_start,
130 .access = g_md_access,
131 };
132
133 DECLARE_GEOM_CLASS(g_md_class, g_md);
134
135
136 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
137
138 #define NINDIR (PAGE_SIZE / sizeof(uintptr_t))
139 #define NMASK (NINDIR-1)
140 static int nshift;
141
142 struct indir {
143 uintptr_t *array;
144 u_int total;
145 u_int used;
146 u_int shift;
147 };
148
149 struct md_s {
150 int unit;
151 LIST_ENTRY(md_s) list;
152 struct bio_queue_head bio_queue;
153 struct mtx queue_mtx;
154 struct cdev *dev;
155 enum md_types type;
156 unsigned nsect;
157 unsigned opencount;
158 unsigned secsize;
159 unsigned fwheads;
160 unsigned fwsectors;
161 unsigned flags;
162 char name[20];
163 struct proc *procp;
164 struct g_geom *gp;
165 struct g_provider *pp;
166
167 /* MD_MALLOC related fields */
168 struct indir *indir;
169 uma_zone_t uma;
170
171 /* MD_PRELOAD related fields */
172 u_char *pl_ptr;
173 unsigned pl_len;
174
175 /* MD_VNODE related fields */
176 struct vnode *vnode;
177 struct ucred *cred;
178
179 /* MD_SWAP related fields */
180 vm_object_t object;
181 unsigned npage;
182 };
183
184 static int mddestroy(struct md_s *sc, struct thread *td);
185
/*
 * Allocate one indirection node with a zeroed NINDIR-entry pointer array.
 * Allocated M_NOWAIT so it is safe to call from the I/O path; returns
 * NULL on allocation failure (callers map that to ENOSPC).
 */
186 static struct indir *
187 new_indir(u_int shift)
188 {
189 struct indir *ip;
190
191 ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
192 if (ip == NULL)
193 return (NULL);
194 ip->array = malloc(sizeof(uintptr_t) * NINDIR,
195 M_MDSECT, M_NOWAIT | M_ZERO);
196 if (ip->array == NULL) {
197 free(ip, M_MD);
198 return (NULL);
199 }
200 ip->total = NINDIR;
201 ip->shift = shift;
202 return (ip);
203 }
204
/* Free a single indirection node and its pointer array. */
205 static void
206 del_indir(struct indir *ip)
207 {
208
209 free(ip->array, M_MDSECT);
210 free(ip, M_MD);
211 }
212
/*
 * Recursively tear down an indirection (sub)tree.  Interior nodes
 * (shift != 0) are recursed into.  At the leaf level, entries > 255
 * are sector buffers from sc->uma and must be freed; values 1..255
 * encode a compressed constant-fill sector and own no storage.
 */
213 static void
214 destroy_indir(struct md_s *sc, struct indir *ip)
215 {
216 int i;
217
218 for (i = 0; i < NINDIR; i++) {
219 if (!ip->array[i])
220 continue;
221 if (ip->shift)
222 destroy_indir(sc, (struct indir*)(ip->array[i]));
223 else if (ip->array[i] > 255)
224 uma_zfree(sc->uma, (void *)(ip->array[i]));
225 }
226 del_indir(ip);
227 }
228
229 /*
230 * This function does the math and alloctes the top level "indir" structure
231 * for a device of "size" sectors.
232 */
233
/*
 * Compute the number of indirection layers needed for "size" sectors
 * and allocate the top-level node.  Side effect: sets the file-scope
 * "nshift" to log2(NINDIR), which s_write() later relies on when
 * allocating branch nodes.  M_WAITOK: only called at create time.
 */
234 static struct indir *
235 dimension(off_t size)
236 {
237 off_t rcnt;
238 struct indir *ip;
239 int i, layer;
240
241 rcnt = size;
242 layer = 0;
243 while (rcnt > NINDIR) {
244 rcnt /= NINDIR;
245 layer++;
246 }
247 /* figure out log2(NINDIR) */
248 for (i = NINDIR, nshift = -1; i; nshift++)
249 i >>= 1;
250
251 /*
252 * XXX: the top layer is probably not fully populated, so we allocate
253 * too much space for ip->array in here.
254 */
255 ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
256 ip->array = malloc(sizeof(uintptr_t) * NINDIR,
257 M_MDSECT, M_WAITOK | M_ZERO);
258 ip->total = NINDIR;
259 ip->shift = layer * nshift;
260 return (ip);
261 }
262
263 /*
264 * Read a given sector
265 */
266
/*
 * Look up the value stored for sector "offset": walk interior nodes
 * by the per-level shift, then index the leaf.  Returns 0 if any
 * level is unpopulated (sector never written / pruned).
 */
267 static uintptr_t
268 s_read(struct indir *ip, off_t offset)
269 {
270 struct indir *cip;
271 int idx;
272 uintptr_t up;
273
274 if (md_debug > 1)
275 printf("s_read(%jd)\n", (intmax_t)offset);
276 up = 0;
277 for (cip = ip; cip != NULL;) {
278 if (cip->shift) {
279 idx = (offset >> cip->shift) & NMASK;
280 up = cip->array[idx];
281 cip = (struct indir *)up;
282 continue;
283 }
284 idx = offset & NMASK;
285 return (cip->array[idx]);
286 }
287 /* Hit a NULL interior pointer: sector is unallocated. */
288 return (0);
289 }
289
290 /*
291 * Write a given sector, prune the tree if the value is 0
292 */
293
/*
 * Store "ptr" for sector "offset", allocating interior nodes on the
 * way down as needed (ENOSPC if that fails).  Writing 0 deletes the
 * entry, and the walk-back loop at the bottom prunes interior nodes
 * whose "used" count drops to zero.  lip[] records the path from the
 * root so the prune can walk back up; its depth (10) bounds the tree
 * height.
 */
294 static int
295 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
296 {
297 struct indir *cip, *lip[10];
298 int idx, li;
299 uintptr_t up;
300
301 if (md_debug > 1)
302 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
303 up = 0;
304 li = 0;
305 cip = ip;
306 for (;;) {
307 lip[li++] = cip;
308 if (cip->shift) {
309 idx = (offset >> cip->shift) & NMASK;
310 up = cip->array[idx];
311 if (up != 0) {
312 cip = (struct indir *)up;
313 continue;
314 }
315 /* Allocate branch */
316 cip->array[idx] =
317 (uintptr_t)new_indir(cip->shift - nshift);
318 if (cip->array[idx] == 0)
319 return (ENOSPC);
320 cip->used++;
321 up = cip->array[idx];
322 cip = (struct indir *)up;
323 continue;
324 }
325 /* leafnode */
326 idx = offset & NMASK;
327 up = cip->array[idx];
328 if (up != 0)
329 cip->used--;
330 cip->array[idx] = ptr;
331 if (ptr != 0)
332 cip->used++;
333 break;
334 }
/* Prune: free now-empty nodes from the leaf back toward the root. */
335 if (cip->used != 0 || li == 1)
336 return (0);
337 li--;
338 while (cip->used == 0 && cip != ip) {
339 li--;
340 idx = (offset >> lip[li]->shift) & NMASK;
341 up = lip[li]->array[idx];
342 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
343 del_indir(cip);
344 lip[li]->array[idx] = 0;
345 lip[li]->used--;
346 cip = lip[li];
347 }
348 return (0);
349 }
350
351
/*
 * GEOM access method: track only whether the provider has any open
 * references (opencount is a boolean here, not a counter).  r/w/e
 * are deltas; adding the current ac* values yields the new totals.
 */
352 static int
353 g_md_access(struct g_provider *pp, int r, int w, int e)
354 {
355 struct md_s *sc;
356
357 sc = pp->geom->softc;
358 if (sc == NULL)
359 return (ENXIO);
360 r += pp->acr;
361 w += pp->acw;
362 e += pp->ace;
363 if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
364 sc->opencount = 1;
365 } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
366 sc->opencount = 0;
367 }
368 return (0);
369 }
370
/*
 * GEOM start method: convert the byte offset to a sector number,
 * queue the bio under the softc's queue mutex, and wake the per-unit
 * worker thread (md_kthread) which performs the actual I/O.
 */
371 static void
372 g_md_start(struct bio *bp)
373 {
374 struct md_s *sc;
375
376 sc = bp->bio_to->geom->softc;
377
378 bp->bio_pblkno = bp->bio_offset / sc->secsize;
379 bp->bio_bcount = bp->bio_length;
380 mtx_lock(&sc->queue_mtx);
381 bioq_disksort(&sc->bio_queue, bp);
382 mtx_unlock(&sc->queue_mtx);
383
384 wakeup(sc);
385 }
386
387
388
/*
 * Service one bio for an MD_MALLOC device, sector by sector.
 * Sector values in the indir tree are: 0 = unallocated (reads as
 * zeros), 1..255 = compressed sector filled with that byte value
 * (MD_COMPRESS), > 255 = pointer to a uma-allocated sector buffer.
 * "osp" is cleared in branches that must not free the old sector;
 * the "osp > 255" check at the loop bottom frees replaced buffers.
 */
389 static int
390 mdstart_malloc(struct md_s *sc, struct bio *bp)
391 {
392 int i, error;
393 u_char *dst;
394 unsigned secno, nsec, uc;
395 uintptr_t sp, osp;
396
397 nsec = bp->bio_bcount / sc->secsize;
398 secno = bp->bio_pblkno;
399 dst = bp->bio_data;
400 error = 0;
401 while (nsec--) {
402 osp = s_read(sc->indir, secno);
403 if (bp->bio_cmd == BIO_DELETE) {
404 if (osp != 0)
405 error = s_write(sc->indir, secno, 0);
406 } else if (bp->bio_cmd == BIO_READ) {
407 if (osp == 0)
408 bzero(dst, sc->secsize);
409 else if (osp <= 255)
410 for (i = 0; i < sc->secsize; i++)
411 dst[i] = osp;
412 else
413 bcopy((void *)osp, dst, sc->secsize);
414 osp = 0;
415 } else if (bp->bio_cmd == BIO_WRITE) {
/* Scan for a uniform-fill sector worth compressing. */
416 if (sc->flags & MD_COMPRESS) {
417 uc = dst[0];
418 for (i = 1; i < sc->secsize; i++)
419 if (dst[i] != uc)
420 break;
421 } else {
422 i = 0;
423 uc = 0;
424 }
425 if (i == sc->secsize) {
426 if (osp != uc)
427 error = s_write(sc->indir, secno, uc);
428 } else {
429 if (osp <= 255) {
430 sp = (uintptr_t) uma_zalloc(
431 sc->uma, M_NOWAIT);
432 if (sp == 0) {
433 error = ENOSPC;
434 break;
435 }
436 bcopy(dst, (void *)sp, sc->secsize);
437 error = s_write(sc->indir, secno, sp);
438 } else {
/* In-place overwrite; keep the existing buffer. */
439 bcopy(dst, (void *)osp, sc->secsize);
440 osp = 0;
441 }
442 }
443 } else {
444 error = EOPNOTSUPP;
445 }
446 if (osp > 255)
447 uma_zfree(sc->uma, (void*)osp);
448 if (error)
449 break;
450 secno++;
451 dst += sc->secsize;
452 }
453 bp->bio_resid = 0;
454 return (error);
455 }
456
/*
 * Service one bio for an MD_PRELOAD device: straight bcopy to/from
 * the preloaded image.  BIO_DELETE is accepted but is a no-op.
 */
457 static int
458 mdstart_preload(struct md_s *sc, struct bio *bp)
459 {
460
461 if (bp->bio_cmd == BIO_DELETE) {
462 } else if (bp->bio_cmd == BIO_READ) {
463 bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
464 } else {
465 bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
466 }
467 bp->bio_resid = 0;
468 return (0);
469 }
470
/*
 * Service one bio for an MD_VNODE device by building a sysspace uio
 * and calling VOP_READ/VOP_WRITE on the backing vnode under an
 * exclusive vnode lock.  Writes are bracketed by vn_start_write /
 * vn_finished_write for suspension accounting, and are IO_SYNC
 * unless MD_ASYNC was requested at create time.
 */
471 static int
472 mdstart_vnode(struct md_s *sc, struct bio *bp)
473 {
474 int error;
475 struct uio auio;
476 struct iovec aiov;
477 struct mount *mp;
478
479 /*
480 * VNODE I/O
481 *
482 * If an error occurs, we set BIO_ERROR but we do not set
483 * B_INVAL because (for a write anyway), the buffer is
484 * still valid.
485 */
486
487 bzero(&auio, sizeof(auio));
488
489 aiov.iov_base = bp->bio_data;
490 aiov.iov_len = bp->bio_bcount;
491 auio.uio_iov = &aiov;
492 auio.uio_iovcnt = 1;
493 auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
494 auio.uio_segflg = UIO_SYSSPACE;
495 if(bp->bio_cmd == BIO_READ)
496 auio.uio_rw = UIO_READ;
497 else if(bp->bio_cmd == BIO_WRITE)
498 auio.uio_rw = UIO_WRITE;
499 else
500 panic("wrong BIO_OP in mdstart_vnode");
501 auio.uio_resid = bp->bio_bcount;
502 auio.uio_td = curthread;
503 /*
504 * When reading set IO_DIRECT to try to avoid double-caching
505 * the data. When writing IO_DIRECT is not optimal.
506 */
507 if (bp->bio_cmd == BIO_READ) {
508 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
509 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
510 VOP_UNLOCK(sc->vnode, 0, curthread);
511 } else {
512 (void) vn_start_write(sc->vnode, &mp, V_WAIT);
513 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
514 error = VOP_WRITE(sc->vnode, &auio,
515 sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred);
516 VOP_UNLOCK(sc->vnode, 0, curthread);
517 vn_finished_write(mp);
518 }
519 bp->bio_resid = auio.uio_resid;
520 return (error);
521 }
522
/*
 * Service one bio for an MD_SWAP device: grab each backing VM page,
 * temporarily map it with an sf_buf, and bcopy between the page and
 * the bio data.  Partially-valid pages are faulted in through the
 * swap pager before being read or partially overwritten.
 *
 * NOTE(review): "rv" from vm_pager_get_pages() is assigned but never
 * checked, so a pager read error is silently ignored here — confirm
 * whether callers can tolerate that before relying on this path.
 */
523 static int
524 mdstart_swap(struct md_s *sc, struct bio *bp)
525 {
526 struct sf_buf *sf;
527 int i, rv;
528 int offs, len, lastp, lastend;
529 vm_page_t m;
530 u_char *p;
531
532 p = bp->bio_data;
533
534 /*
535 * offs is the ofset at whih to start operating on the
536 * next (ie, first) page. lastp is the last page on
537 * which we're going to operate. lastend is the ending
538 * position within that last page (ie, PAGE_SIZE if
539 * we're operating on complete aligned pages).
540 */
541 offs = bp->bio_offset % PAGE_SIZE;
542 lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
543 lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
544
545 VM_OBJECT_LOCK(sc->object);
546 vm_object_pip_add(sc->object, 1);
547 for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
548 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
549
550 m = vm_page_grab(sc->object, i,
551 VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
/* Object lock is dropped only around the sf_buf mapping. */
552 VM_OBJECT_UNLOCK(sc->object);
553 sf = sf_buf_alloc(m, 0);
554 VM_OBJECT_LOCK(sc->object);
555 if (bp->bio_cmd == BIO_READ) {
556 if (m->valid != VM_PAGE_BITS_ALL)
557 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
558 bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
559 } else if (bp->bio_cmd == BIO_WRITE) {
560 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
561 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
562 bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
563 m->valid = VM_PAGE_BITS_ALL;
564 #if 0
565 } else if (bp->bio_cmd == BIO_DELETE) {
566 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
567 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
568 bzero((void *)(sf_buf_kva(sf) + offs), len);
569 vm_page_dirty(m);
570 m->valid = VM_PAGE_BITS_ALL;
571 #endif
572 }
573 sf_buf_free(sf);
574 vm_page_lock_queues();
575 vm_page_wakeup(m);
576 vm_page_activate(m);
577 if (bp->bio_cmd == BIO_WRITE)
578 vm_page_dirty(m);
579 vm_page_unlock_queues();
580
581 /* Actions on further pages start at offset 0 */
582 p += PAGE_SIZE - offs;
583 offs = 0;
584 #if 0
585 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
586 printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
587 m->wire_count, m->busy,
588 m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
589 #endif
590 }
591 vm_object_pip_subtract(sc->object, 1);
592 vm_object_set_writeable_dirty(sc->object);
593 VM_OBJECT_UNLOCK(sc->object);
594 return (0);
595 }
596
/*
 * Per-unit worker thread.  Loops pulling bios off sc->bio_queue and
 * dispatching them to the type-specific mdstart_* handler; sleeps on
 * the softc when the queue is empty.  Exits when MD_SHUTDOWN is set
 * and the queue has drained, clearing sc->procp and waking mddestroy
 * which is waiting on &sc->procp.  An error value of -1 means the
 * bio was already completed (g_handleattr_int delivered it), so it
 * must not be delivered again.
 */
597 static void
598 md_kthread(void *arg)
599 {
600 struct md_s *sc;
601 struct bio *bp;
602 int error, hasgiant;
603
604 sc = arg;
605 curthread->td_base_pri = PRIBIO;
606
/* Vnode I/O still needs Giant; the other backends do not. */
607 switch (sc->type) {
608 case MD_VNODE:
609 mtx_lock(&Giant);
610 hasgiant = 1;
611 break;
612 case MD_MALLOC:
613 case MD_PRELOAD:
614 case MD_SWAP:
615 default:
616 hasgiant = 0;
617 break;
618 }
619
620 for (;;) {
621 mtx_lock(&sc->queue_mtx);
622 bp = bioq_first(&sc->bio_queue);
623 if (bp)
624 bioq_remove(&sc->bio_queue, bp);
625 if (!bp) {
626 if (sc->flags & MD_SHUTDOWN) {
627 mtx_unlock(&sc->queue_mtx);
628 sc->procp = NULL;
629 wakeup(&sc->procp);
630 if (hasgiant)
631 mtx_unlock(&Giant);
632 kthread_exit(0);
633 }
633 	/* PDROP: queue_mtx is released while sleeping. */
634 msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
635 continue;
636 }
637 mtx_unlock(&sc->queue_mtx);
638 if (bp->bio_cmd == BIO_GETATTR) {
639 if (sc->fwsectors && sc->fwheads &&
640 (g_handleattr_int(bp, "GEOM::fwsectors",
641 sc->fwsectors) ||
642 g_handleattr_int(bp, "GEOM::fwheads",
643 sc->fwheads)))
644 error = -1;
645 else
646 error = EOPNOTSUPP;
647 } else {
648 switch (sc->type) {
649 case MD_MALLOC:
650 error = mdstart_malloc(sc, bp);
651 break;
652 case MD_PRELOAD:
653 error = mdstart_preload(sc, bp);
654 break;
655 case MD_VNODE:
656 error = mdstart_vnode(sc, bp);
657 break;
658 case MD_SWAP:
659 error = mdstart_swap(sc, bp);
660 break;
661 default:
662 panic("Impossible md(type)");
663 break;
664 }
665 }
666
667 if (error != -1) {
668 bp->bio_completed = bp->bio_length;
669 g_io_deliver(bp, error);
670 }
671 }
672 }
673
/*
 * Look up a softc by unit number; returns NULL if not found.
 * List is unlocked (see XXX), relying on Giant serialization.
 */
674 static struct md_s *
675 mdfind(int unit)
676 {
677 struct md_s *sc;
678
679 /* XXX: LOCK(unique unit numbers) */
680 LIST_FOREACH(sc, &md_softc_list, list) {
681 if (sc->unit == unit)
682 break;
683 }
684 /* XXX: UNLOCK(unique unit numbers) */
685 return (sc);
686 }
687
/*
 * Allocate and register a new softc.  unit == -1 requests automatic
 * unit assignment (max existing + 1); an explicit unit that is
 * already in use yields NULL.  Also starts the per-unit worker
 * thread.  NOTE(review): on kthread_create failure the softc is
 * freed without mtx_destroy(&sc->queue_mtx) — confirm whether that
 * leaks mutex state.
 */
688 static struct md_s *
689 mdnew(int unit)
690 {
691 struct md_s *sc;
692 int error, max = -1;
693
694 /* XXX: LOCK(unique unit numbers) */
695 LIST_FOREACH(sc, &md_softc_list, list) {
696 if (sc->unit == unit) {
697 /* XXX: UNLOCK(unique unit numbers) */
698 return (NULL);
699 }
700 if (sc->unit > max)
701 max = sc->unit;
702 }
703 if (unit == -1)
704 unit = max + 1;
705 sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
706 sc->unit = unit;
707 bioq_init(&sc->bio_queue);
708 mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
709 sprintf(sc->name, "md%d", unit);
710 error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
711 if (error) {
712 free(sc, M_MD);
713 return (NULL);
714 }
715 LIST_INSERT_HEAD(&md_softc_list, sc, list);
716 /* XXX: UNLOCK(unique unit numbers) */
717 return (sc);
718 }
719
/*
 * Create the GEOM geom and provider for a fully-configured softc and
 * announce it (g_error_provider(pp, 0) makes it usable).  Must be
 * called once the type-specific fields (nsect, secsize, ...) are set.
 */
720 static void
721 mdinit(struct md_s *sc)
722 {
723
724 struct g_geom *gp;
725 struct g_provider *pp;
726
727 DROP_GIANT();
728 g_topology_lock();
729 gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
730 gp->softc = sc;
731 pp = g_new_providerf(gp, "md%d", sc->unit);
732 pp->mediasize = (off_t)sc->nsect * sc->secsize;
733 pp->sectorsize = sc->secsize;
734 sc->gp = gp;
735 sc->pp = pp;
736 g_error_provider(pp, 0);
737 g_topology_unlock();
738 PICKUP_GIANT();
739 }
740
741 /*
742 * XXX: we should check that the range they feed us is mapped.
743 * XXX: we should implement read-only.
744 */
745
/*
 * MDIOCATTACH handler for MD_PRELOAD: wrap an existing in-kernel
 * memory range (mdio->md_base, md_size sectors) as a disk.  Only
 * MD_AUTOUNIT is accepted in md_options.
 */
746 static int
747 mdcreate_preload(struct md_ioctl *mdio)
748 {
749 struct md_s *sc;
750
751 if (mdio->md_size == 0)
752 return (EINVAL);
753 if (mdio->md_options & ~(MD_AUTOUNIT))
754 return (EINVAL);
755 if (mdio->md_options & MD_AUTOUNIT) {
756 sc = mdnew(-1);
757 if (sc == NULL)
758 return (ENOMEM);
759 mdio->md_unit = sc->unit;
760 } else {
761 sc = mdnew(mdio->md_unit);
762 if (sc == NULL)
763 return (EBUSY);
764 }
765 sc->type = MD_PRELOAD;
766 sc->secsize = DEV_BSIZE;
767 sc->nsect = mdio->md_size;
768 sc->flags = mdio->md_options & MD_FORCE;
769 /* Cast to pointer size, then to pointer to avoid warning */
770 sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
771 sc->pl_len = (mdio->md_size << DEV_BSHIFT);
772 mdinit(sc);
773 return (0);
774 }
775
776
/*
 * MDIOCATTACH handler for MD_MALLOC: build the indirection tree and
 * the per-unit UMA zone for sector storage.  MD_RESERVE preallocates
 * every sector up front (and disables MD_COMPRESS, which would be
 * pointless with reserved space); without it the provider is marked
 * G_PF_CANDELETE so BIO_DELETE can release sectors.
 */
777 static int
778 mdcreate_malloc(struct md_ioctl *mdio)
779 {
780 struct md_s *sc;
781 off_t u;
782 uintptr_t sp;
783 int error;
784
785 error = 0;
786 if (mdio->md_size == 0)
787 return (EINVAL);
788 if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
789 return (EINVAL);
790 if (mdio->md_secsize != 0 && !powerof2(mdio->md_secsize))
791 return (EINVAL);
792 /* Compression doesn't make sense if we have reserved space */
793 if (mdio->md_options & MD_RESERVE)
794 mdio->md_options &= ~MD_COMPRESS;
795 if (mdio->md_options & MD_AUTOUNIT) {
796 sc = mdnew(-1);
797 if (sc == NULL)
798 return (ENOMEM);
799 mdio->md_unit = sc->unit;
800 } else {
801 sc = mdnew(mdio->md_unit);
802 if (sc == NULL)
803 return (EBUSY);
804 }
805 sc->type = MD_MALLOC;
806 if (mdio->md_secsize != 0)
807 sc->secsize = mdio->md_secsize;
808 else
809 sc->secsize = DEV_BSIZE;
810 if (mdio->md_fwsectors != 0)
811 sc->fwsectors = mdio->md_fwsectors;
812 if (mdio->md_fwheads != 0)
813 sc->fwheads = mdio->md_fwheads;
814 sc->nsect = (mdio->md_size * DEV_BSIZE) / sc->secsize;
815 sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
816 sc->indir = dimension(sc->nsect);
817 sc->uma = uma_zcreate(sc->name, sc->secsize,
818 NULL, NULL, NULL, NULL, 0x1ff, 0);
819 if (mdio->md_options & MD_RESERVE) {
820 for (u = 0; u < sc->nsect; u++) {
821 sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
822 if (sp != 0)
823 error = s_write(sc->indir, u, sp);
824 else
825 error = ENOMEM;
826 if (error)
827 break;
828 }
829 }
830 if (error) {
831 mddestroy(sc, NULL);
832 return (error);
833 }
834 mdinit(sc);
835 if (!(mdio->md_options & MD_RESERVE))
836 sc->pp->flags |= G_PF_CANDELETE;
837 return (0);
838 }
839
840
/*
 * Record the credentials used for backing-store I/O, replacing any
 * previous ones.  For vnode-backed devices a dummy first-sector read
 * is issued so network filesystems (NFS) bind the credentials now
 * rather than failing later in the I/O path.  Returns the error from
 * that probe read, else 0.
 */
841 static int
842 mdsetcred(struct md_s *sc, struct ucred *cred)
843 {
844 char *tmpbuf;
845 int error = 0;
846
847 /*
848 * Set credits in our softc
849 */
850
851 if (sc->cred)
852 crfree(sc->cred);
853 sc->cred = crhold(cred);
854
855 /*
856 * Horrible kludge to establish credentials for NFS XXX.
857 */
858
859 if (sc->vnode) {
860 struct uio auio;
861 struct iovec aiov;
862
863 tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
864 bzero(&auio, sizeof(auio));
865
866 aiov.iov_base = tmpbuf;
867 aiov.iov_len = sc->secsize;
868 auio.uio_iov = &aiov;
869 auio.uio_iovcnt = 1;
870 auio.uio_offset = 0;
871 auio.uio_rw = UIO_READ;
872 auio.uio_segflg = UIO_SYSSPACE;
873 auio.uio_resid = aiov.iov_len;
874 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
875 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
876 VOP_UNLOCK(sc->vnode, 0, curthread);
877 free(tmpbuf, M_TEMP);
878 }
879 return (error);
880 }
881
882 static int
883 mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
884 {
885 struct md_s *sc;
886 struct vattr vattr;
887 struct nameidata nd;
888 int error, flags;
889
890 flags = FREAD|FWRITE;
891 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
892 error = vn_open(&nd, &flags, 0, -1);
893 if (error) {
894 if (error != EACCES && error != EPERM && error != EROFS)
895 return (error);
896 flags &= ~FWRITE;
897 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
898 error = vn_open(&nd, &flags, 0, -1);
899 if (error)
900 return (error);
901 }
902 NDFREE(&nd, NDF_ONLY_PNBUF);
903 if (nd.ni_vp->v_type != VREG ||
904 (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
905 VOP_UNLOCK(nd.ni_vp, 0, td);
906 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
907 return (error ? error : EINVAL);
908 }
909 VOP_UNLOCK(nd.ni_vp, 0, td);
910
911 if (mdio->md_options & MD_AUTOUNIT) {
912 sc = mdnew(-1);
913 mdio->md_unit = sc->unit;
914 } else {
915 sc = mdnew(mdio->md_unit);
916 }
917 if (sc == NULL) {
918 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
919 return (EBUSY);
920 }
921
922 if (mdio->md_fwsectors != 0)
923 sc->fwsectors = mdio->md_fwsectors;
924 if (mdio->md_fwheads != 0)
925 sc->fwheads = mdio->md_fwheads;
926 sc->type = MD_VNODE;
927 sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
928 if (!(flags & FWRITE))
929 sc->flags |= MD_READONLY;
930 sc->secsize = DEV_BSIZE;
931 sc->vnode = nd.ni_vp;
932
933 /*
934 * If the size is specified, override the file attributes.
935 */
936 if (mdio->md_size)
937 sc->nsect = mdio->md_size;
938 else
939 sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
940 if (sc->nsect == 0) {
941 mddestroy(sc, td);
942 return (EINVAL);
943 }
944 error = mdsetcred(sc, td->td_ucred);
945 if (error) {
946 mddestroy(sc, td);
947 return (error);
948 }
949 mdinit(sc);
950 return (0);
951 }
952
/*
 * GEOM event callback queued by mddestroy(): wither the geom unless
 * the event was cancelled.
 */
953 static void
954 md_zapit(void *p, int cancel)
955 {
956 if (cancel)
957 return;
958 g_wither_geom(p, ENXIO);
959 }
960
961 static int
962 mddestroy(struct md_s *sc, struct thread *td)
963 {
964
965 GIANT_REQUIRED;
966
967 mtx_destroy(&sc->queue_mtx);
968 if (sc->gp) {
969 sc->gp->softc = NULL;
970 g_waitfor_event(md_zapit, sc->gp, M_WAITOK, sc->gp, NULL);
971 sc->gp = NULL;
972 sc->pp = NULL;
973 }
974 sc->flags |= MD_SHUTDOWN;
975 wakeup(sc);
976 while (sc->procp != NULL)
977 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
978 if (sc->vnode != NULL)
979 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
980 FREAD : (FREAD|FWRITE), sc->cred, td);
981 if (sc->cred != NULL)
982 crfree(sc->cred);
983 if (sc->object != NULL) {
984 vm_object_deallocate(sc->object);
985 }
986 if (sc->indir)
987 destroy_indir(sc, sc->indir);
988 if (sc->uma)
989 uma_zdestroy(sc->uma);
990
991 /* XXX: LOCK(unique unit numbers) */
992 LIST_REMOVE(sc, list);
993 /* XXX: UNLOCK(unique unit numbers) */
994 free(sc, M_MD);
995 return (0);
996 }
997
998 static int
999 mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
1000 {
1001 int error;
1002 struct md_s *sc;
1003
1004 GIANT_REQUIRED;
1005
1006 if (mdio->md_options & MD_AUTOUNIT) {
1007 sc = mdnew(-1);
1008 mdio->md_unit = sc->unit;
1009 } else {
1010 sc = mdnew(mdio->md_unit);
1011 }
1012 if (sc == NULL)
1013 return (EBUSY);
1014
1015 sc->type = MD_SWAP;
1016
1017 /*
1018 * Range check. Disallow negative sizes or any size less then the
1019 * size of a page. Then round to a page.
1020 */
1021
1022 if (mdio->md_size == 0) {
1023 mddestroy(sc, td);
1024 return (EDOM);
1025 }
1026
1027 /*
1028 * Allocate an OBJT_SWAP object.
1029 *
1030 * sc_nsect is in units of DEV_BSIZE.
1031 * sc_npage is in units of PAGE_SIZE.
1032 *
1033 * Note the truncation.
1034 */
1035
1036 sc->secsize = DEV_BSIZE;
1037 sc->npage = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
1038 sc->nsect = sc->npage * (PAGE_SIZE / DEV_BSIZE);
1039 if (mdio->md_fwsectors != 0)
1040 sc->fwsectors = mdio->md_fwsectors;
1041 if (mdio->md_fwheads != 0)
1042 sc->fwheads = mdio->md_fwheads;
1043 sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE *
1044 (vm_offset_t)sc->npage, VM_PROT_DEFAULT, 0);
1045 sc->flags = mdio->md_options & MD_FORCE;
1046 if (mdio->md_options & MD_RESERVE) {
1047 if (swap_pager_reserve(sc->object, 0, sc->npage) < 0) {
1048 vm_object_deallocate(sc->object);
1049 sc->object = NULL;
1050 mddestroy(sc, td);
1051 return (EDOM);
1052 }
1053 }
1054 error = mdsetcred(sc, td->td_ucred);
1055 if (error) {
1056 mddestroy(sc, td);
1057 return (error);
1058 }
1059 mdinit(sc);
1060 if (!(mdio->md_options & MD_RESERVE))
1061 sc->pp->flags |= G_PF_CANDELETE;
1062 return (0);
1063 }
1064
/*
 * MDIOCDETACH handler: destroy the unit unless it is still open and
 * MD_FORCE was not set at create time.
 */
1065 static int
1066 mddetach(int unit, struct thread *td)
1067 {
1068 struct md_s *sc;
1069
1070 sc = mdfind(unit);
1071 if (sc == NULL)
1072 return (ENOENT);
1073 if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
1074 return (EBUSY);
1075 switch(sc->type) {
1076 case MD_VNODE:
1077 case MD_SWAP:
1078 case MD_MALLOC:
1079 case MD_PRELOAD:
1080 return (mddestroy(sc, td));
1081 default:
1082 return (EOPNOTSUPP);
1083 }
1084 }
1085
/*
 * ioctl entry point on the control device (/dev/mdctl): dispatches
 * attach/detach/query/list requests to the per-type create handlers
 * above.  Version checking is done per-ioctl (see comment below).
 */
1086 static int
1087 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
1088 {
1089 struct md_ioctl *mdio;
1090 struct md_s *sc;
1091 int i;
1092
1093 if (md_debug)
1094 printf("mdctlioctl(%s %lx %p %x %p)\n",
1095 devtoname(dev), cmd, addr, flags, td);
1096
1097 /*
1098 * We assert the version number in the individual ioctl
1099 * handlers instead of out here because (a) it is possible we
1100 * may add another ioctl in the future which doesn't read an
1101 * mdio, and (b) the correct return value for an unknown ioctl
1102 * is ENOIOCTL, not EINVAL.
1103 */
1104 mdio = (struct md_ioctl *)addr;
1105 switch (cmd) {
1106 case MDIOCATTACH:
1107 if (mdio->md_version != MDIOVERSION)
1108 return (EINVAL);
1109 switch (mdio->md_type) {
1110 case MD_MALLOC:
1111 return (mdcreate_malloc(mdio));
1112 case MD_PRELOAD:
1113 return (mdcreate_preload(mdio));
1114 case MD_VNODE:
1115 return (mdcreate_vnode(mdio, td));
1116 case MD_SWAP:
1117 return (mdcreate_swap(mdio, td));
1118 default:
1119 return (EINVAL);
1120 }
1121 case MDIOCDETACH:
1122 if (mdio->md_version != MDIOVERSION)
1123 return (EINVAL);
1124 if (mdio->md_file != NULL || mdio->md_size != 0 ||
1125 mdio->md_options != 0)
1126 return (EINVAL);
1127 return (mddetach(mdio->md_unit, td));
1128 case MDIOCQUERY:
1129 if (mdio->md_version != MDIOVERSION)
1130 return (EINVAL);
1131 sc = mdfind(mdio->md_unit);
1132 if (sc == NULL)
1133 return (ENOENT);
1134 mdio->md_type = sc->type;
1135 mdio->md_options = sc->flags;
1136 switch (sc->type) {
1137 case MD_MALLOC:
1138 mdio->md_size = sc->nsect;
1139 break;
1140 case MD_PRELOAD:
1141 mdio->md_size = sc->nsect;
1142 mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr;
1143 break;
1144 case MD_SWAP:
1145 mdio->md_size = sc->nsect;
1146 break;
1147 case MD_VNODE:
1148 mdio->md_size = sc->nsect;
1149 /* XXX fill this in */
1150 mdio->md_file = NULL;
1151 break;
1152 }
1153 return (0);
1154 case MDIOCLIST:
/* md_pad[0] holds the count; -1 marks a truncated list. */
1155 i = 1;
1156 LIST_FOREACH(sc, &md_softc_list, list) {
1157 if (i == MDNPAD - 1)
1158 mdio->md_pad[i] = -1;
1159 else
1160 mdio->md_pad[i++] = sc->unit;
1161 }
1162 mdio->md_pad[0] = i - 1;
1163 return (0);
1164 default:
1165 return (ENOIOCTL);
1166 };
1167 return (ENOIOCTL);
1168 }
1169
/*
 * Register a preloaded image (from the loader or compiled-in
 * MD_ROOT) as an auto-numbered MD_PRELOAD unit.  If MD_ROOT is
 * configured and this becomes unit 0, it is installed as the root
 * filesystem candidate.
 */
1170 static void
1171 md_preloaded(u_char *image, unsigned length)
1172 {
1173 struct md_s *sc;
1174
1175 sc = mdnew(-1);
1176 if (sc == NULL)
1177 return;
1178 sc->type = MD_PRELOAD;
1179 sc->secsize = DEV_BSIZE;
1180 sc->nsect = length / DEV_BSIZE;
1181 sc->pl_ptr = image;
1182 sc->pl_len = length;
1183 #ifdef MD_ROOT
1184 if (sc->unit == 0)
1185 rootdevnames[0] = "ufs:/dev/md0";
1186 #endif
1187 mdinit(sc);
1188 }
1189
/*
 * GEOM class init: register the compiled-in root image (if any),
 * scan the loader's preload metadata for "md_image"/"mfs_root"
 * modules and wrap each as a preload unit, then create the control
 * device.  Called with the topology lock held; it is dropped around
 * the work and re-taken before returning.
 */
1190 static void
1191 g_md_init(struct g_class *mp __unused)
1192 {
1193
1194 caddr_t mod;
1195 caddr_t c;
1196 u_char *ptr, *name, *type;
1197 unsigned len;
1198
1199 mod = NULL;
1200 g_topology_unlock();
1201 #ifdef MD_ROOT_SIZE
1202 md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
1203 #endif
1204 while ((mod = preload_search_next_name(mod)) != NULL) {
1205 name = (char *)preload_search_info(mod, MODINFO_NAME);
1206 type = (char *)preload_search_info(mod, MODINFO_TYPE);
1207 if (name == NULL)
1208 continue;
1209 if (type == NULL)
1210 continue;
1211 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1212 continue;
1213 c = preload_search_info(mod, MODINFO_ADDR);
1214 ptr = *(u_char **)c;
1215 c = preload_search_info(mod, MODINFO_SIZE);
1216 len = *(size_t *)c;
1217 printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
1218 MD_NAME, mdunits, name, len, ptr);
1219 md_preloaded(ptr, len);
1220 }
1221 status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
1222 0600, MDCTL_NAME);
1223 g_topology_lock();
1224 }
1225
/* GEOM class fini: remove the control device if it was created. */
1226 static void
1227 g_md_fini(struct g_class *mp __unused)
1228 {
1229
1230 if (status_dev != NULL)
1231 destroy_dev(status_dev);
1232 }
Cache object: 06e7813e5d0ebeb9df446aad96837147
|