FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c
1 /*-
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: releng/6.0/sys/dev/md/md.c 151065 2005-10-07 15:08:41Z phk $
10 *
11 */
12
13 /*-
14 * The following functions are based in the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 * The Regents of the University of California. All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 4. Neither the name of the University nor the names of its contributors
35 * may be used to endorse or promote products derived from this software
36 * without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 *
50 * from: Utah Hdr: vn.c 1.13 94/04/02
51 *
52 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94
53 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
54 */
55
56 #include "opt_geom.h"
57 #include "opt_md.h"
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bio.h>
62 #include <sys/conf.h>
63 #include <sys/fcntl.h>
64 #include <sys/kernel.h>
65 #include <sys/kthread.h>
66 #include <sys/linker.h>
67 #include <sys/lock.h>
68 #include <sys/malloc.h>
69 #include <sys/mdioctl.h>
70 #include <sys/mutex.h>
71 #include <sys/sx.h>
72 #include <sys/namei.h>
73 #include <sys/proc.h>
74 #include <sys/queue.h>
75 #include <sys/sched.h>
76 #include <sys/sf_buf.h>
77 #include <sys/sysctl.h>
78 #include <sys/vnode.h>
79
80 #include <geom/geom.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pager.h>
86 #include <vm/swap_pager.h>
87 #include <vm/uma.h>
88
89 #define MD_MODVER 1
90
91 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */
92
93 #ifndef MD_NSECT
94 #define MD_NSECT (10000 * 2)
95 #endif
96
97 static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
98 static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
99
100 static int md_debug;
101 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
102
103 #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
104 /* Image gets put here: */
105 static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
106 static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
107 #endif
108
109 static g_init_t g_md_init;
110 static g_fini_t g_md_fini;
111 static g_start_t g_md_start;
112 static g_access_t g_md_access;
113
114 static int mdunits;
115 static struct cdev *status_dev = 0;
116 static struct sx md_sx;
117
118 static d_ioctl_t mdctlioctl;
119
120 static struct cdevsw mdctl_cdevsw = {
121 .d_version = D_VERSION,
122 .d_ioctl = mdctlioctl,
123 .d_name = MD_NAME,
124 };
125
126 struct g_class g_md_class = {
127 .name = "MD",
128 .version = G_VERSION,
129 .init = g_md_init,
130 .fini = g_md_fini,
131 .start = g_md_start,
132 .access = g_md_access,
133 };
134
135 DECLARE_GEOM_CLASS(g_md_class, g_md);
136
137
138 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
139
140 #define NINDIR (PAGE_SIZE / sizeof(uintptr_t))
141 #define NMASK (NINDIR-1)
142 static int nshift;
143
144 struct indir {
145 uintptr_t *array;
146 u_int total;
147 u_int used;
148 u_int shift;
149 };
150
151 struct md_s {
152 int unit;
153 LIST_ENTRY(md_s) list;
154 struct bio_queue_head bio_queue;
155 struct mtx queue_mtx;
156 struct cdev *dev;
157 enum md_types type;
158 off_t mediasize;
159 unsigned sectorsize;
160 unsigned opencount;
161 unsigned fwheads;
162 unsigned fwsectors;
163 unsigned flags;
164 char name[20];
165 struct proc *procp;
166 struct g_geom *gp;
167 struct g_provider *pp;
168 int (*start)(struct md_s *sc, struct bio *bp);
169
170 /* MD_MALLOC related fields */
171 struct indir *indir;
172 uma_zone_t uma;
173
174 /* MD_PRELOAD related fields */
175 u_char *pl_ptr;
176 size_t pl_len;
177
178 /* MD_VNODE related fields */
179 struct vnode *vnode;
180 char file[PATH_MAX];
181 struct ucred *cred;
182
183 /* MD_SWAP related fields */
184 vm_object_t object;
185 };
186
187 static struct indir *
188 new_indir(u_int shift)
189 {
190 struct indir *ip;
191
192 ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
193 if (ip == NULL)
194 return (NULL);
195 ip->array = malloc(sizeof(uintptr_t) * NINDIR,
196 M_MDSECT, M_NOWAIT | M_ZERO);
197 if (ip->array == NULL) {
198 free(ip, M_MD);
199 return (NULL);
200 }
201 ip->total = NINDIR;
202 ip->shift = shift;
203 return (ip);
204 }
205
206 static void
207 del_indir(struct indir *ip)
208 {
209
210 free(ip->array, M_MDSECT);
211 free(ip, M_MD);
212 }
213
214 static void
215 destroy_indir(struct md_s *sc, struct indir *ip)
216 {
217 int i;
218
219 for (i = 0; i < NINDIR; i++) {
220 if (!ip->array[i])
221 continue;
222 if (ip->shift)
223 destroy_indir(sc, (struct indir*)(ip->array[i]));
224 else if (ip->array[i] > 255)
225 uma_zfree(sc->uma, (void *)(ip->array[i]));
226 }
227 del_indir(ip);
228 }
229
230 /*
231 * This function does the math and alloctes the top level "indir" structure
232 * for a device of "size" sectors.
233 */
234
235 static struct indir *
236 dimension(off_t size)
237 {
238 off_t rcnt;
239 struct indir *ip;
240 int i, layer;
241
242 rcnt = size;
243 layer = 0;
244 while (rcnt > NINDIR) {
245 rcnt /= NINDIR;
246 layer++;
247 }
248 /* figure out log2(NINDIR) */
249 for (i = NINDIR, nshift = -1; i; nshift++)
250 i >>= 1;
251
252 /*
253 * XXX: the top layer is probably not fully populated, so we allocate
254 * too much space for ip->array in here.
255 */
256 ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
257 ip->array = malloc(sizeof(uintptr_t) * NINDIR,
258 M_MDSECT, M_WAITOK | M_ZERO);
259 ip->total = NINDIR;
260 ip->shift = layer * nshift;
261 return (ip);
262 }
263
264 /*
265 * Read a given sector
266 */
267
268 static uintptr_t
269 s_read(struct indir *ip, off_t offset)
270 {
271 struct indir *cip;
272 int idx;
273 uintptr_t up;
274
275 if (md_debug > 1)
276 printf("s_read(%jd)\n", (intmax_t)offset);
277 up = 0;
278 for (cip = ip; cip != NULL;) {
279 if (cip->shift) {
280 idx = (offset >> cip->shift) & NMASK;
281 up = cip->array[idx];
282 cip = (struct indir *)up;
283 continue;
284 }
285 idx = offset & NMASK;
286 return (cip->array[idx]);
287 }
288 return (0);
289 }
290
291 /*
292 * Write a given sector, prune the tree if the value is 0
293 */
294
295 static int
296 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
297 {
298 struct indir *cip, *lip[10];
299 int idx, li;
300 uintptr_t up;
301
302 if (md_debug > 1)
303 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
304 up = 0;
305 li = 0;
306 cip = ip;
307 for (;;) {
308 lip[li++] = cip;
309 if (cip->shift) {
310 idx = (offset >> cip->shift) & NMASK;
311 up = cip->array[idx];
312 if (up != 0) {
313 cip = (struct indir *)up;
314 continue;
315 }
316 /* Allocate branch */
317 cip->array[idx] =
318 (uintptr_t)new_indir(cip->shift - nshift);
319 if (cip->array[idx] == 0)
320 return (ENOSPC);
321 cip->used++;
322 up = cip->array[idx];
323 cip = (struct indir *)up;
324 continue;
325 }
326 /* leafnode */
327 idx = offset & NMASK;
328 up = cip->array[idx];
329 if (up != 0)
330 cip->used--;
331 cip->array[idx] = ptr;
332 if (ptr != 0)
333 cip->used++;
334 break;
335 }
336 if (cip->used != 0 || li == 1)
337 return (0);
338 li--;
339 while (cip->used == 0 && cip != ip) {
340 li--;
341 idx = (offset >> lip[li]->shift) & NMASK;
342 up = lip[li]->array[idx];
343 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
344 del_indir(cip);
345 lip[li]->array[idx] = 0;
346 lip[li]->used--;
347 cip = lip[li];
348 }
349 return (0);
350 }
351
352
353 static int
354 g_md_access(struct g_provider *pp, int r, int w, int e)
355 {
356 struct md_s *sc;
357
358 sc = pp->geom->softc;
359 if (sc == NULL)
360 return (ENXIO);
361 r += pp->acr;
362 w += pp->acw;
363 e += pp->ace;
364 if ((sc->flags & MD_READONLY) != 0 && w > 0)
365 return (EROFS);
366 if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
367 sc->opencount = 1;
368 } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
369 sc->opencount = 0;
370 }
371 return (0);
372 }
373
374 static void
375 g_md_start(struct bio *bp)
376 {
377 struct md_s *sc;
378
379 sc = bp->bio_to->geom->softc;
380 mtx_lock(&sc->queue_mtx);
381 bioq_disksort(&sc->bio_queue, bp);
382 mtx_unlock(&sc->queue_mtx);
383 wakeup(sc);
384 }
385
386 static int
387 mdstart_malloc(struct md_s *sc, struct bio *bp)
388 {
389 int i, error;
390 u_char *dst;
391 off_t secno, nsec, uc;
392 uintptr_t sp, osp;
393
394 nsec = bp->bio_length / sc->sectorsize;
395 secno = bp->bio_offset / sc->sectorsize;
396 dst = bp->bio_data;
397 error = 0;
398 while (nsec--) {
399 osp = s_read(sc->indir, secno);
400 if (bp->bio_cmd == BIO_DELETE) {
401 if (osp != 0)
402 error = s_write(sc->indir, secno, 0);
403 } else if (bp->bio_cmd == BIO_READ) {
404 if (osp == 0)
405 bzero(dst, sc->sectorsize);
406 else if (osp <= 255)
407 for (i = 0; i < sc->sectorsize; i++)
408 dst[i] = osp;
409 else
410 bcopy((void *)osp, dst, sc->sectorsize);
411 osp = 0;
412 } else if (bp->bio_cmd == BIO_WRITE) {
413 if (sc->flags & MD_COMPRESS) {
414 uc = dst[0];
415 for (i = 1; i < sc->sectorsize; i++)
416 if (dst[i] != uc)
417 break;
418 } else {
419 i = 0;
420 uc = 0;
421 }
422 if (i == sc->sectorsize) {
423 if (osp != uc)
424 error = s_write(sc->indir, secno, uc);
425 } else {
426 if (osp <= 255) {
427 sp = (uintptr_t)uma_zalloc(sc->uma,
428 M_NOWAIT);
429 if (sp == 0) {
430 error = ENOSPC;
431 break;
432 }
433 bcopy(dst, (void *)sp, sc->sectorsize);
434 error = s_write(sc->indir, secno, sp);
435 } else {
436 bcopy(dst, (void *)osp, sc->sectorsize);
437 osp = 0;
438 }
439 }
440 } else {
441 error = EOPNOTSUPP;
442 }
443 if (osp > 255)
444 uma_zfree(sc->uma, (void*)osp);
445 if (error != 0)
446 break;
447 secno++;
448 dst += sc->sectorsize;
449 }
450 bp->bio_resid = 0;
451 return (error);
452 }
453
454 static int
455 mdstart_preload(struct md_s *sc, struct bio *bp)
456 {
457
458 switch (bp->bio_cmd) {
459 case BIO_READ:
460 bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data,
461 bp->bio_length);
462 break;
463 case BIO_WRITE:
464 bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset,
465 bp->bio_length);
466 break;
467 }
468 bp->bio_resid = 0;
469 return (0);
470 }
471
472 static int
473 mdstart_vnode(struct md_s *sc, struct bio *bp)
474 {
475 int error;
476 struct uio auio;
477 struct iovec aiov;
478 struct mount *mp;
479
480 mtx_assert(&Giant, MA_OWNED);
481 /*
482 * VNODE I/O
483 *
484 * If an error occurs, we set BIO_ERROR but we do not set
485 * B_INVAL because (for a write anyway), the buffer is
486 * still valid.
487 */
488
489 bzero(&auio, sizeof(auio));
490
491 aiov.iov_base = bp->bio_data;
492 aiov.iov_len = bp->bio_length;
493 auio.uio_iov = &aiov;
494 auio.uio_iovcnt = 1;
495 auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
496 auio.uio_segflg = UIO_SYSSPACE;
497 if(bp->bio_cmd == BIO_READ)
498 auio.uio_rw = UIO_READ;
499 else if(bp->bio_cmd == BIO_WRITE)
500 auio.uio_rw = UIO_WRITE;
501 else
502 panic("wrong BIO_OP in mdstart_vnode");
503 auio.uio_resid = bp->bio_length;
504 auio.uio_td = curthread;
505 /*
506 * When reading set IO_DIRECT to try to avoid double-caching
507 * the data. When writing IO_DIRECT is not optimal.
508 */
509 if (bp->bio_cmd == BIO_READ) {
510 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
511 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
512 VOP_UNLOCK(sc->vnode, 0, curthread);
513 } else {
514 (void) vn_start_write(sc->vnode, &mp, V_WAIT);
515 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
516 error = VOP_WRITE(sc->vnode, &auio,
517 sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred);
518 VOP_UNLOCK(sc->vnode, 0, curthread);
519 vn_finished_write(mp);
520 }
521 bp->bio_resid = auio.uio_resid;
522 return (error);
523 }
524
525 static int
526 mdstart_swap(struct md_s *sc, struct bio *bp)
527 {
528 struct sf_buf *sf;
529 int rv, offs, len, lastend;
530 vm_pindex_t i, lastp;
531 vm_page_t m;
532 u_char *p;
533
534 p = bp->bio_data;
535
536 /*
537 * offs is the ofset at whih to start operating on the
538 * next (ie, first) page. lastp is the last page on
539 * which we're going to operate. lastend is the ending
540 * position within that last page (ie, PAGE_SIZE if
541 * we're operating on complete aligned pages).
542 */
543 offs = bp->bio_offset % PAGE_SIZE;
544 lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
545 lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
546
547 rv = VM_PAGER_OK;
548 VM_OBJECT_LOCK(sc->object);
549 vm_object_pip_add(sc->object, 1);
550 for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
551 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
552
553 m = vm_page_grab(sc->object, i,
554 VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
555 VM_OBJECT_UNLOCK(sc->object);
556 sched_pin();
557 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
558 VM_OBJECT_LOCK(sc->object);
559 if (bp->bio_cmd == BIO_READ) {
560 if (m->valid != VM_PAGE_BITS_ALL)
561 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
562 if (rv == VM_PAGER_ERROR) {
563 sf_buf_free(sf);
564 sched_unpin();
565 vm_page_lock_queues();
566 vm_page_wakeup(m);
567 vm_page_unlock_queues();
568 break;
569 }
570 bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
571 } else if (bp->bio_cmd == BIO_WRITE) {
572 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
573 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
574 if (rv == VM_PAGER_ERROR) {
575 sf_buf_free(sf);
576 sched_unpin();
577 vm_page_lock_queues();
578 vm_page_wakeup(m);
579 vm_page_unlock_queues();
580 break;
581 }
582 bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
583 m->valid = VM_PAGE_BITS_ALL;
584 #if 0
585 } else if (bp->bio_cmd == BIO_DELETE) {
586 if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
587 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
588 if (rv == VM_PAGER_ERROR) {
589 sf_buf_free(sf);
590 sched_unpin();
591 vm_page_lock_queues();
592 vm_page_wakeup(m);
593 vm_page_unlock_queues();
594 break;
595 }
596 bzero((void *)(sf_buf_kva(sf) + offs), len);
597 vm_page_dirty(m);
598 m->valid = VM_PAGE_BITS_ALL;
599 #endif
600 }
601 sf_buf_free(sf);
602 sched_unpin();
603 vm_page_lock_queues();
604 vm_page_wakeup(m);
605 vm_page_activate(m);
606 if (bp->bio_cmd == BIO_WRITE)
607 vm_page_dirty(m);
608 vm_page_unlock_queues();
609
610 /* Actions on further pages start at offset 0 */
611 p += PAGE_SIZE - offs;
612 offs = 0;
613 #if 0
614 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
615 printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
616 m->wire_count, m->busy,
617 m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
618 #endif
619 }
620 vm_object_pip_subtract(sc->object, 1);
621 vm_object_set_writeable_dirty(sc->object);
622 VM_OBJECT_UNLOCK(sc->object);
623 return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
624 }
625
626 static void
627 md_kthread(void *arg)
628 {
629 struct md_s *sc;
630 struct bio *bp;
631 int error, hasgiant;
632
633 sc = arg;
634 mtx_lock_spin(&sched_lock);
635 sched_prio(curthread, PRIBIO);
636 mtx_unlock_spin(&sched_lock);
637
638 switch (sc->type) {
639 case MD_VNODE:
640 mtx_lock(&Giant);
641 hasgiant = 1;
642 break;
643 case MD_MALLOC:
644 case MD_PRELOAD:
645 case MD_SWAP:
646 default:
647 hasgiant = 0;
648 break;
649 }
650
651 for (;;) {
652 if (sc->flags & MD_SHUTDOWN) {
653 sc->procp = NULL;
654 wakeup(&sc->procp);
655 if (hasgiant)
656 mtx_unlock(&Giant);
657 kthread_exit(0);
658 }
659 mtx_lock(&sc->queue_mtx);
660 bp = bioq_takefirst(&sc->bio_queue);
661 if (!bp) {
662 msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
663 continue;
664 }
665 mtx_unlock(&sc->queue_mtx);
666 if (bp->bio_cmd == BIO_GETATTR) {
667 if (sc->fwsectors && sc->fwheads &&
668 (g_handleattr_int(bp, "GEOM::fwsectors",
669 sc->fwsectors) ||
670 g_handleattr_int(bp, "GEOM::fwheads",
671 sc->fwheads)))
672 error = -1;
673 else
674 error = EOPNOTSUPP;
675 } else {
676 error = sc->start(sc, bp);
677 }
678
679 if (error != -1) {
680 bp->bio_completed = bp->bio_length;
681 g_io_deliver(bp, error);
682 }
683 }
684 }
685
686 static struct md_s *
687 mdfind(int unit)
688 {
689 struct md_s *sc;
690
691 LIST_FOREACH(sc, &md_softc_list, list) {
692 if (sc->unit == unit)
693 break;
694 }
695 return (sc);
696 }
697
698 static struct md_s *
699 mdnew(int unit, int *errp, enum md_types type)
700 {
701 struct md_s *sc, *sc2;
702 int error, max = -1;
703
704 *errp = 0;
705 LIST_FOREACH(sc2, &md_softc_list, list) {
706 if (unit == sc2->unit) {
707 *errp = EBUSY;
708 return (NULL);
709 }
710 if (unit == -1 && sc2->unit > max)
711 max = sc2->unit;
712 }
713 if (unit == -1)
714 unit = max + 1;
715 sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
716 sc->type = type;
717 bioq_init(&sc->bio_queue);
718 mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
719 sc->unit = unit;
720 sprintf(sc->name, "md%d", unit);
721 LIST_INSERT_HEAD(&md_softc_list, sc, list);
722 error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
723 if (error == 0)
724 return (sc);
725 LIST_REMOVE(sc, list);
726 mtx_destroy(&sc->queue_mtx);
727 free(sc, M_MD);
728 *errp = error;
729 return (NULL);
730 }
731
732 static void
733 mdinit(struct md_s *sc)
734 {
735
736 struct g_geom *gp;
737 struct g_provider *pp;
738
739 g_topology_lock();
740 gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
741 gp->softc = sc;
742 pp = g_new_providerf(gp, "md%d", sc->unit);
743 pp->mediasize = sc->mediasize;
744 pp->sectorsize = sc->sectorsize;
745 sc->gp = gp;
746 sc->pp = pp;
747 g_error_provider(pp, 0);
748 g_topology_unlock();
749 }
750
751 /*
752 * XXX: we should check that the range they feed us is mapped.
753 * XXX: we should implement read-only.
754 */
755
756 static int
757 mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio)
758 {
759
760 if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE))
761 return (EINVAL);
762 sc->flags = mdio->md_options & MD_FORCE;
763 /* Cast to pointer size, then to pointer to avoid warning */
764 sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
765 sc->pl_len = (size_t)sc->mediasize;
766 return (0);
767 }
768
769
770 static int
771 mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
772 {
773 uintptr_t sp;
774 int error;
775 off_t u;
776
777 error = 0;
778 if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
779 return (EINVAL);
780 if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
781 return (EINVAL);
782 /* Compression doesn't make sense if we have reserved space */
783 if (mdio->md_options & MD_RESERVE)
784 mdio->md_options &= ~MD_COMPRESS;
785 if (mdio->md_fwsectors != 0)
786 sc->fwsectors = mdio->md_fwsectors;
787 if (mdio->md_fwheads != 0)
788 sc->fwheads = mdio->md_fwheads;
789 sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
790 sc->indir = dimension(sc->mediasize / sc->sectorsize);
791 sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
792 0x1ff, 0);
793 if (mdio->md_options & MD_RESERVE) {
794 off_t nsectors;
795
796 nsectors = sc->mediasize / sc->sectorsize;
797 for (u = 0; u < nsectors; u++) {
798 sp = (uintptr_t)uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
799 if (sp != 0)
800 error = s_write(sc->indir, u, sp);
801 else
802 error = ENOMEM;
803 if (error != 0)
804 break;
805 }
806 }
807 return (error);
808 }
809
810
811 static int
812 mdsetcred(struct md_s *sc, struct ucred *cred)
813 {
814 char *tmpbuf;
815 int error = 0;
816
817 /*
818 * Set credits in our softc
819 */
820
821 if (sc->cred)
822 crfree(sc->cred);
823 sc->cred = crhold(cred);
824
825 /*
826 * Horrible kludge to establish credentials for NFS XXX.
827 */
828
829 if (sc->vnode) {
830 struct uio auio;
831 struct iovec aiov;
832
833 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
834 bzero(&auio, sizeof(auio));
835
836 aiov.iov_base = tmpbuf;
837 aiov.iov_len = sc->sectorsize;
838 auio.uio_iov = &aiov;
839 auio.uio_iovcnt = 1;
840 auio.uio_offset = 0;
841 auio.uio_rw = UIO_READ;
842 auio.uio_segflg = UIO_SYSSPACE;
843 auio.uio_resid = aiov.iov_len;
844 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
845 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
846 VOP_UNLOCK(sc->vnode, 0, curthread);
847 free(tmpbuf, M_TEMP);
848 }
849 return (error);
850 }
851
852 static int
853 mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
854 {
855 struct vattr vattr;
856 struct nameidata nd;
857 int error, flags;
858
859 error = copyinstr(mdio->md_file, sc->file, sizeof(sc->file), NULL);
860 if (error != 0)
861 return (error);
862 flags = FREAD|FWRITE;
863 /*
864 * If the user specified that this is a read only device, unset the
865 * FWRITE mask before trying to open the backing store.
866 */
867 if ((mdio->md_options & MD_READONLY) != 0)
868 flags &= ~FWRITE;
869 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
870 error = vn_open(&nd, &flags, 0, -1);
871 NDFREE(&nd, NDF_ONLY_PNBUF);
872 if (error != 0)
873 return (error);
874 if (nd.ni_vp->v_type != VREG ||
875 (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
876 VOP_UNLOCK(nd.ni_vp, 0, td);
877 (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
878 return (error ? error : EINVAL);
879 }
880 VOP_UNLOCK(nd.ni_vp, 0, td);
881
882 if (mdio->md_fwsectors != 0)
883 sc->fwsectors = mdio->md_fwsectors;
884 if (mdio->md_fwheads != 0)
885 sc->fwheads = mdio->md_fwheads;
886 sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
887 if (!(flags & FWRITE))
888 sc->flags |= MD_READONLY;
889 sc->vnode = nd.ni_vp;
890
891 error = mdsetcred(sc, td->td_ucred);
892 if (error != 0) {
893 (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
894 return (error);
895 }
896 return (0);
897 }
898
899 static int
900 mddestroy(struct md_s *sc, struct thread *td)
901 {
902
903
904 if (sc->gp) {
905 sc->gp->softc = NULL;
906 g_topology_lock();
907 g_wither_geom(sc->gp, ENXIO);
908 g_topology_unlock();
909 sc->gp = NULL;
910 sc->pp = NULL;
911 }
912 sc->flags |= MD_SHUTDOWN;
913 wakeup(sc);
914 while (sc->procp != NULL)
915 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
916 mtx_destroy(&sc->queue_mtx);
917 if (sc->vnode != NULL) {
918 mtx_lock(&Giant);
919 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
920 FREAD : (FREAD|FWRITE), sc->cred, td);
921 mtx_unlock(&Giant);
922 }
923 if (sc->cred != NULL)
924 crfree(sc->cred);
925 if (sc->object != NULL)
926 vm_object_deallocate(sc->object);
927 if (sc->indir)
928 destroy_indir(sc, sc->indir);
929 if (sc->uma)
930 uma_zdestroy(sc->uma);
931
932 LIST_REMOVE(sc, list);
933 free(sc, M_MD);
934 return (0);
935 }
936
937 static int
938 mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
939 {
940 vm_ooffset_t npage;
941 int error;
942
943 /*
944 * Range check. Disallow negative sizes or any size less then the
945 * size of a page. Then round to a page.
946 */
947 if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0)
948 return (EDOM);
949
950 /*
951 * Allocate an OBJT_SWAP object.
952 *
953 * Note the truncation.
954 */
955
956 npage = mdio->md_mediasize / PAGE_SIZE;
957 if (mdio->md_fwsectors != 0)
958 sc->fwsectors = mdio->md_fwsectors;
959 if (mdio->md_fwheads != 0)
960 sc->fwheads = mdio->md_fwheads;
961 sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
962 VM_PROT_DEFAULT, 0);
963 if (sc->object == NULL)
964 return (ENOMEM);
965 sc->flags = mdio->md_options & MD_FORCE;
966 if (mdio->md_options & MD_RESERVE) {
967 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
968 vm_object_deallocate(sc->object);
969 sc->object = NULL;
970 return (EDOM);
971 }
972 }
973 error = mdsetcred(sc, td->td_ucred);
974 if (error != 0) {
975 vm_object_deallocate(sc->object);
976 sc->object = NULL;
977 }
978 return (error);
979 }
980
981
982 static int
983 xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
984 {
985 struct md_ioctl *mdio;
986 struct md_s *sc;
987 int error, i;
988
989 if (md_debug)
990 printf("mdctlioctl(%s %lx %p %x %p)\n",
991 devtoname(dev), cmd, addr, flags, td);
992
993 mdio = (struct md_ioctl *)addr;
994 if (mdio->md_version != MDIOVERSION)
995 return (EINVAL);
996
997 /*
998 * We assert the version number in the individual ioctl
999 * handlers instead of out here because (a) it is possible we
1000 * may add another ioctl in the future which doesn't read an
1001 * mdio, and (b) the correct return value for an unknown ioctl
1002 * is ENOIOCTL, not EINVAL.
1003 */
1004 error = 0;
1005 switch (cmd) {
1006 case MDIOCATTACH:
1007 switch (mdio->md_type) {
1008 case MD_MALLOC:
1009 case MD_PRELOAD:
1010 case MD_VNODE:
1011 case MD_SWAP:
1012 break;
1013 default:
1014 return (EINVAL);
1015 }
1016 if (mdio->md_options & MD_AUTOUNIT)
1017 sc = mdnew(-1, &error, mdio->md_type);
1018 else
1019 sc = mdnew(mdio->md_unit, &error, mdio->md_type);
1020 if (sc == NULL)
1021 return (error);
1022 if (mdio->md_options & MD_AUTOUNIT)
1023 mdio->md_unit = sc->unit;
1024 sc->mediasize = mdio->md_mediasize;
1025 if (mdio->md_sectorsize == 0)
1026 sc->sectorsize = DEV_BSIZE;
1027 else
1028 sc->sectorsize = mdio->md_sectorsize;
1029 error = EDOOFUS;
1030 switch (sc->type) {
1031 case MD_MALLOC:
1032 sc->start = mdstart_malloc;
1033 error = mdcreate_malloc(sc, mdio);
1034 break;
1035 case MD_PRELOAD:
1036 sc->start = mdstart_preload;
1037 error = mdcreate_preload(sc, mdio);
1038 break;
1039 case MD_VNODE:
1040 sc->start = mdstart_vnode;
1041 error = mdcreate_vnode(sc, mdio, td);
1042 break;
1043 case MD_SWAP:
1044 sc->start = mdstart_swap;
1045 error = mdcreate_swap(sc, mdio, td);
1046 break;
1047 }
1048 if (error != 0) {
1049 mddestroy(sc, td);
1050 return (error);
1051 }
1052
1053 /* Prune off any residual fractional sector */
1054 i = sc->mediasize % sc->sectorsize;
1055 sc->mediasize -= i;
1056
1057 mdinit(sc);
1058 return (0);
1059 case MDIOCDETACH:
1060 if (mdio->md_mediasize != 0 || mdio->md_options != 0)
1061 return (EINVAL);
1062
1063 sc = mdfind(mdio->md_unit);
1064 if (sc == NULL)
1065 return (ENOENT);
1066 if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
1067 return (EBUSY);
1068 return (mddestroy(sc, td));
1069 case MDIOCQUERY:
1070 sc = mdfind(mdio->md_unit);
1071 if (sc == NULL)
1072 return (ENOENT);
1073 mdio->md_type = sc->type;
1074 mdio->md_options = sc->flags;
1075 mdio->md_mediasize = sc->mediasize;
1076 mdio->md_sectorsize = sc->sectorsize;
1077 if (sc->type == MD_VNODE)
1078 error = copyout(sc->file, mdio->md_file,
1079 strlen(sc->file) + 1);
1080 return (error);
1081 case MDIOCLIST:
1082 i = 1;
1083 LIST_FOREACH(sc, &md_softc_list, list) {
1084 if (i == MDNPAD - 1)
1085 mdio->md_pad[i] = -1;
1086 else
1087 mdio->md_pad[i++] = sc->unit;
1088 }
1089 mdio->md_pad[0] = i - 1;
1090 return (0);
1091 default:
1092 return (ENOIOCTL);
1093 };
1094 }
1095
1096 static int
1097 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
1098 {
1099 int error;
1100
1101 sx_xlock(&md_sx);
1102 error = xmdctlioctl(dev, cmd, addr, flags, td);
1103 sx_xunlock(&md_sx);
1104 return (error);
1105 }
1106
1107 static void
1108 md_preloaded(u_char *image, size_t length)
1109 {
1110 struct md_s *sc;
1111 int error;
1112
1113 sc = mdnew(-1, &error, MD_PRELOAD);
1114 if (sc == NULL)
1115 return;
1116 sc->mediasize = length;
1117 sc->sectorsize = DEV_BSIZE;
1118 sc->pl_ptr = image;
1119 sc->pl_len = length;
1120 sc->start = mdstart_preload;
1121 #ifdef MD_ROOT
1122 if (sc->unit == 0)
1123 rootdevnames[0] = "ufs:/dev/md0";
1124 #endif
1125 mdinit(sc);
1126 }
1127
1128 static void
1129 g_md_init(struct g_class *mp __unused)
1130 {
1131
1132 caddr_t mod;
1133 caddr_t c;
1134 u_char *ptr, *name, *type;
1135 unsigned len;
1136
1137 mod = NULL;
1138 sx_init(&md_sx, "MD config lock");
1139 g_topology_unlock();
1140 #ifdef MD_ROOT_SIZE
1141 sx_xlock(&md_sx);
1142 md_preloaded(mfs_root, MD_ROOT_SIZE * 1024);
1143 sx_xunlock(&md_sx);
1144 #endif
1145 /* XXX: are preload_* static or do they need Giant ? */
1146 while ((mod = preload_search_next_name(mod)) != NULL) {
1147 name = (char *)preload_search_info(mod, MODINFO_NAME);
1148 if (name == NULL)
1149 continue;
1150 type = (char *)preload_search_info(mod, MODINFO_TYPE);
1151 if (type == NULL)
1152 continue;
1153 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1154 continue;
1155 c = preload_search_info(mod, MODINFO_ADDR);
1156 ptr = *(u_char **)c;
1157 c = preload_search_info(mod, MODINFO_SIZE);
1158 len = *(size_t *)c;
1159 printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
1160 MD_NAME, mdunits, name, len, ptr);
1161 sx_xlock(&md_sx);
1162 md_preloaded(ptr, len);
1163 sx_xunlock(&md_sx);
1164 }
1165 status_dev = make_dev(&mdctl_cdevsw, MAXMINOR, UID_ROOT, GID_WHEEL,
1166 0600, MDCTL_NAME);
1167 g_topology_lock();
1168 }
1169
1170 static void
1171 g_md_fini(struct g_class *mp __unused)
1172 {
1173
1174 sx_destroy(&md_sx);
1175 if (status_dev != NULL)
1176 destroy_dev(status_dev);
1177 }
Cache object: 412061acb907d5efe1124080e7bbf5ed
|