1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/bio.h>
34 #include <sys/endian.h>
35 #include <sys/kernel.h>
36 #include <sys/kobj.h>
37 #include <sys/limits.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/mutex.h>
41 #include <sys/sysctl.h>
42 #include <sys/systm.h>
43 #include <geom/geom.h>
44 #include <geom/geom_dbg.h>
45 #include "geom/raid/g_raid.h"
46 #include "g_raid_tr_if.h"
47
/* Number of copies kept of each data strip (RAID1E keeps N mirrors). */
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

/* Values of trso_type: which background recovery process is running. */
#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

/* Bits of trso_flags. */
#define TR_RAID1E_F_DOING_SOME	0x1	/* A rebuild I/O is in flight. */
#define TR_RAID1E_F_LOCKED	0x2	/* trso_lock_pos/len range is locked. */
#define TR_RAID1E_F_ABORT	0x4	/* Abort rebuild at next completion. */
85
/*
 * Per-volume softc of the RAID1E transformation module.  Extends the
 * generic g_raid_tr_object with rebuild/resync bookkeeping.
 */
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object trso_base;	/* Generic TR object (must be first). */
	int trso_starting;		/* Volume still starting up. */
	int trso_stopping;		/* Volume is being stopped. */
	int trso_type;			/* TR_RAID1E_{NONE,REBUILD,RESYNC}. */
	int trso_recover_slabs;		/* slabs before rest */
	int trso_fair_io;		/* Countdown of normal I/Os until next rebuild I/O. */
	int trso_meta_update;		/* Countdown of rebuild I/Os until metadata write. */
	int trso_flags;			/* TR_RAID1E_F_* flags. */
	struct g_raid_subdisk *trso_failed_sd; /* like per volume */
	void *trso_buffer;		/* Buffer space */
	off_t trso_lock_pos;		/* Locked range start. */
	off_t trso_lock_len;		/* Locked range length. */
	struct bio trso_bio;		/* Preallocated bio for rebuild I/O. */
};
101
/* Transformation method implementations (interfaces in g_raid_tr_if.h). */
static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

/* kobj dispatch table binding the TR interface to this module. */
static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);
141
142 static inline void
143 V2P(struct g_raid_volume *vol, off_t virt,
144 int *disk, off_t *offset, off_t *start)
145 {
146 off_t nstrip;
147 u_int strip_size;
148
149 strip_size = vol->v_strip_size;
150 /* Strip number. */
151 nstrip = virt / strip_size;
152 /* Start position in strip. */
153 *start = virt % strip_size;
154 /* Disk number. */
155 *disk = (nstrip * N) % vol->v_disks_count;
156 /* Strip start position in disk. */
157 *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
158 }
159
160 static inline void
161 P2V(struct g_raid_volume *vol, int disk, off_t offset,
162 off_t *virt, int *copy)
163 {
164 off_t nstrip, start;
165 u_int strip_size;
166
167 strip_size = vol->v_strip_size;
168 /* Start position in strip. */
169 start = offset % strip_size;
170 /* Physical strip number. */
171 nstrip = (offset / strip_size) * vol->v_disks_count + disk;
172 /* Number of physical strip (copy) inside virtual strip. */
173 *copy = nstrip % N;
174 /* Offset in virtual space. */
175 *virt = (nstrip / N) * strip_size + start;
176 }
177
178 static int
179 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
180 {
181 struct g_raid_tr_raid1e_object *trs;
182
183 trs = (struct g_raid_tr_raid1e_object *)tr;
184 if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
185 tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
186 return (G_RAID_TR_TASTE_FAIL);
187 trs->trso_starting = 1;
188 return (G_RAID_TR_TASTE_SUCCEED);
189 }
190
/*
 * Compute the volume state when the disk count is a multiple of N:
 * disks form independent N-disk groups, each group holding the same
 * strips.  For every group, promote the best usable non-ACTIVE subdisk
 * to ACTIVE, then rate the group by its best/worst members.  The
 * volume state is the worst group state.
 */
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		/*
		 * Find the most up-to-date subdisk of the group; among
		 * equally rebuilding/resyncing ones, prefer the one that
		 * has progressed furthest.
		 */
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		/* Find the least up-to-date subdisk of the group. */
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		/* Rate the group, then fold into the volume state. */
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
243
/*
 * Compute the volume state when the disk count is not a multiple of N:
 * copies wrap around the disk set, so groups of N consecutive disks
 * (modulo disk count) are examined.  UNINITIALIZED subdisks are first
 * promoted to STALE, then each window is rated by its best/worst
 * members; the volume state is the worst window state.
 */
static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	/* Fast path: all subdisks active means OPTIMAL. */
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		/* Examine the window of N disks starting at i (wrapping). */
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
297
/*
 * Recompute the volume state, apply it if changed (sending an UP/DOWN
 * event and writing metadata), and then, unless the volume is starting
 * or stopping, consider triggering/aborting a rebuild.
 */
static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		/* Even/odd disk counts use different group layouts. */
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}
331
/*
 * Fail a disk, unless it carries the last usable copy of data in the
 * volume (see the comment below for the rationale).
 */
static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}
357
/*
 * Common teardown when a rebuild ends (finished or aborted): persist
 * metadata, release the rebuild buffer, clear the rebuild state and
 * re-evaluate the volume state.
 */
static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}
375
/*
 * Rebuild reached the end of the subdisk: mark it ACTIVE, reset its
 * rebuild position and perform the common teardown.
 */
static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}
392
/*
 * Abort the running rebuild.  If a rebuild I/O is still in flight, only
 * set the ABORT flag and let the completion path finish the abort;
 * otherwise drop any held range lock and tear down immediately.
 */
static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		/* Deferred: iodone will call us again when the I/O lands. */
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}
423
424 static void
425 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
426 {
427 struct g_raid_tr_raid1e_object *trs;
428 struct g_raid_softc *sc;
429 struct g_raid_volume *vol;
430 struct g_raid_subdisk *sd;
431 struct bio *bp;
432 off_t len, virtual, vend, offset, start;
433 int disk, copy, best;
434
435 trs = (struct g_raid_tr_raid1e_object *)tr;
436 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
437 return;
438 vol = tr->tro_volume;
439 sc = vol->v_softc;
440 sd = trs->trso_failed_sd;
441
442 while (1) {
443 if (sd->sd_rebuild_pos >= sd->sd_size) {
444 g_raid_tr_raid1e_rebuild_finish(tr);
445 return;
446 }
447 /* Get virtual offset from physical rebuild position. */
448 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©);
449 /* Get physical offset back to get first stripe position. */
450 V2P(vol, virtual, &disk, &offset, &start);
451 /* Calculate contignous data length. */
452 len = MIN(g_raid1e_rebuild_slab,
453 sd->sd_size - sd->sd_rebuild_pos);
454 if ((vol->v_disks_count % N) != 0)
455 len = MIN(len, vol->v_strip_size - start);
456 /* Find disk with most accurate data. */
457 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
458 offset + start, len, 0);
459 if (best < 0) {
460 /* There is no any valid disk. */
461 g_raid_tr_raid1e_rebuild_abort(tr);
462 return;
463 } else if (best != copy) {
464 /* Some other disk has better data. */
465 break;
466 }
467 /* We have the most accurate data. Skip the range. */
468 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
469 sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
470 sd->sd_rebuild_pos += len;
471 }
472
473 bp = &trs->trso_bio;
474 memset(bp, 0, sizeof(*bp));
475 bp->bio_offset = offset + start +
476 ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
477 bp->bio_length = len;
478 bp->bio_data = trs->trso_buffer;
479 bp->bio_cmd = BIO_READ;
480 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
481 bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
482 G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
483 /*
484 * If we are crossing stripe boundary, correct affected virtual
485 * range we should lock.
486 */
487 if (start + len > vol->v_strip_size) {
488 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©);
489 len = vend - virtual;
490 }
491 trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
492 trs->trso_flags |= TR_RAID1E_F_LOCKED;
493 trs->trso_lock_pos = virtual;
494 trs->trso_lock_len = len;
495 /* Lock callback starts I/O */
496 g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
497 }
498
/*
 * Start a rebuild if one is not already running.  Pick the target
 * subdisk by priority: RESYNC, REBUILD, then STALE (promoted to
 * RESYNC), then UNINITIALIZED/NEW (promoted to REBUILD).  Allocates
 * the rebuild buffer and kicks off the first iteration.
 */
static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		/* Rebuild already in progress; nothing to do. */
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild. night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}
555
/*
 * Decide, after a state change, whether a rebuild should be started or
 * an in-progress one aborted (volume degraded too far, no candidates
 * left, or the subdisk under rebuild itself changed state).
 */
static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			/* Any other candidate states worth rebuilding? */
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}
592
/*
 * Event method: any subdisk event simply triggers a state
 * re-evaluation.  Always reports success.
 */
static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}
601
602 static int
603 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
604 {
605 struct g_raid_tr_raid1e_object *trs;
606 struct g_raid_volume *vol;
607
608 trs = (struct g_raid_tr_raid1e_object *)tr;
609 vol = tr->tro_volume;
610 trs->trso_starting = 0;
611 g_raid_tr_update_state_raid1e(vol, NULL);
612 return (0);
613 }
614
615 static int
616 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
617 {
618 struct g_raid_tr_raid1e_object *trs;
619 struct g_raid_volume *vol;
620
621 trs = (struct g_raid_tr_raid1e_object *)tr;
622 vol = tr->tro_volume;
623 trs->trso_starting = 0;
624 trs->trso_stopping = 1;
625 g_raid_tr_update_state_raid1e(vol, NULL);
626 return (0);
627 }
628
629 /*
630 * Select the disk to read from. Take into account: subdisk state, running
631 * error recovery, average disk load, head position and possible cache hits.
632 */
633 #define ABS(x) (((x) >= 0) ? (x) : (-(x)))
634 static int
635 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
636 int no, off_t off, off_t len, u_int mask)
637 {
638 struct g_raid_subdisk *sd;
639 off_t offset;
640 int i, best, prio, bestprio;
641
642 best = -1;
643 bestprio = INT_MAX;
644 for (i = 0; i < N; i++) {
645 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
646 offset = off;
647 if (no + i >= vol->v_disks_count)
648 offset += vol->v_strip_size;
649
650 prio = G_RAID_SUBDISK_LOAD(sd);
651 if ((mask & (1 << sd->sd_pos)) != 0)
652 continue;
653 switch (sd->sd_state) {
654 case G_RAID_SUBDISK_S_ACTIVE:
655 break;
656 case G_RAID_SUBDISK_S_RESYNC:
657 if (offset + off < sd->sd_rebuild_pos)
658 break;
659 /* FALLTHROUGH */
660 case G_RAID_SUBDISK_S_STALE:
661 prio += i << 24;
662 break;
663 case G_RAID_SUBDISK_S_REBUILD:
664 if (offset + off < sd->sd_rebuild_pos)
665 break;
666 /* FALLTHROUGH */
667 default:
668 continue;
669 }
670 prio += min(sd->sd_recovery, 255) << 16;
671 /* If disk head is precisely in position - highly prefer it. */
672 if (G_RAID_SUBDISK_POS(sd) == offset)
673 prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
674 else
675 /* If disk head is close to position - prefer it. */
676 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
677 G_RAID_SUBDISK_TRACK_SIZE)
678 prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
679 if (prio < bestprio) {
680 bestprio = prio;
681 best = i;
682 }
683 }
684 return (best);
685 }
686
687 static void
688 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
689 {
690 struct g_raid_volume *vol;
691 struct g_raid_subdisk *sd;
692 struct bio_queue_head queue;
693 struct bio *cbp;
694 char *addr;
695 off_t offset, start, length, remain;
696 u_int no, strip_size;
697 int best;
698
699 vol = tr->tro_volume;
700 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
701 addr = NULL;
702 else
703 addr = bp->bio_data;
704 strip_size = vol->v_strip_size;
705 V2P(vol, bp->bio_offset, &no, &offset, &start);
706 remain = bp->bio_length;
707 bioq_init(&queue);
708 while (remain > 0) {
709 length = MIN(strip_size - start, remain);
710 best = g_raid_tr_raid1e_select_read_disk(vol,
711 no, offset, length, 0);
712 KASSERT(best >= 0, ("No readable disk in volume %s!",
713 vol->v_name));
714 no += best;
715 if (no >= vol->v_disks_count) {
716 no -= vol->v_disks_count;
717 offset += strip_size;
718 }
719 cbp = g_clone_bio(bp);
720 if (cbp == NULL)
721 goto failure;
722 cbp->bio_offset = offset + start;
723 cbp->bio_length = length;
724 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
725 cbp->bio_ma_offset += (uintptr_t)addr;
726 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
727 cbp->bio_ma_offset %= PAGE_SIZE;
728 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
729 cbp->bio_length) / PAGE_SIZE;
730 } else
731 cbp->bio_data = addr;
732 cbp->bio_caller1 = &vol->v_subdisks[no];
733 bioq_insert_tail(&queue, cbp);
734 no += N - best;
735 if (no >= vol->v_disks_count) {
736 no -= vol->v_disks_count;
737 offset += strip_size;
738 }
739 remain -= length;
740 addr += length;
741 start = 0;
742 }
743 while ((cbp = bioq_takefirst(&queue)) != NULL) {
744 sd = cbp->bio_caller1;
745 cbp->bio_caller1 = NULL;
746 g_raid_subdisk_iostart(sd, cbp);
747 }
748 return;
749 failure:
750 while ((cbp = bioq_takefirst(&queue)) != NULL)
751 g_destroy_bio(cbp);
752 if (bp->bio_error == 0)
753 bp->bio_error = ENOMEM;
754 g_raid_iodone(bp, bp->bio_error);
755 }
756
757 static void
758 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
759 {
760 struct g_raid_volume *vol;
761 struct g_raid_subdisk *sd;
762 struct bio_queue_head queue;
763 struct bio *cbp;
764 char *addr;
765 off_t offset, start, length, remain;
766 u_int no, strip_size;
767 int i;
768
769 vol = tr->tro_volume;
770 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
771 addr = NULL;
772 else
773 addr = bp->bio_data;
774 strip_size = vol->v_strip_size;
775 V2P(vol, bp->bio_offset, &no, &offset, &start);
776 remain = bp->bio_length;
777 bioq_init(&queue);
778 while (remain > 0) {
779 length = MIN(strip_size - start, remain);
780 for (i = 0; i < N; i++) {
781 sd = &vol->v_subdisks[no];
782 switch (sd->sd_state) {
783 case G_RAID_SUBDISK_S_ACTIVE:
784 case G_RAID_SUBDISK_S_STALE:
785 case G_RAID_SUBDISK_S_RESYNC:
786 break;
787 case G_RAID_SUBDISK_S_REBUILD:
788 if (offset + start >= sd->sd_rebuild_pos)
789 goto nextdisk;
790 break;
791 default:
792 goto nextdisk;
793 }
794 cbp = g_clone_bio(bp);
795 if (cbp == NULL)
796 goto failure;
797 cbp->bio_offset = offset + start;
798 cbp->bio_length = length;
799 if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
800 bp->bio_cmd != BIO_DELETE) {
801 cbp->bio_ma_offset += (uintptr_t)addr;
802 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
803 cbp->bio_ma_offset %= PAGE_SIZE;
804 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
805 cbp->bio_length) / PAGE_SIZE;
806 } else
807 cbp->bio_data = addr;
808 cbp->bio_caller1 = sd;
809 bioq_insert_tail(&queue, cbp);
810 nextdisk:
811 if (++no >= vol->v_disks_count) {
812 no = 0;
813 offset += strip_size;
814 }
815 }
816 remain -= length;
817 if (bp->bio_cmd != BIO_DELETE)
818 addr += length;
819 start = 0;
820 }
821 while ((cbp = bioq_takefirst(&queue)) != NULL) {
822 sd = cbp->bio_caller1;
823 cbp->bio_caller1 = NULL;
824 g_raid_subdisk_iostart(sd, cbp);
825 }
826 return;
827 failure:
828 while ((cbp = bioq_takefirst(&queue)) != NULL)
829 g_destroy_bio(cbp);
830 if (bp->bio_error == 0)
831 bp->bio_error = ENOMEM;
832 g_raid_iodone(bp, bp->bio_error);
833 }
834
/*
 * I/O dispatch entry point: reject I/O unless the volume is usable,
 * interleave rebuild work with normal traffic, then route the request
 * by command type.
 */
static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or running now round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}
882
883 static void
884 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
885 struct g_raid_subdisk *sd, struct bio *bp)
886 {
887 struct bio *cbp;
888 struct g_raid_subdisk *nsd;
889 struct g_raid_volume *vol;
890 struct bio *pbp;
891 struct g_raid_tr_raid1e_object *trs;
892 off_t virtual, offset, start;
893 uintptr_t mask;
894 int error, do_write, copy, disk, best;
895
896 trs = (struct g_raid_tr_raid1e_object *)tr;
897 vol = tr->tro_volume;
898 if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
899 if (trs->trso_type == TR_RAID1E_REBUILD) {
900 nsd = trs->trso_failed_sd;
901 if (bp->bio_cmd == BIO_READ) {
902 /* Immediately abort rebuild, if requested. */
903 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
904 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
905 g_raid_tr_raid1e_rebuild_abort(tr);
906 return;
907 }
908
909 /* On read error, skip and cross fingers. */
910 if (bp->bio_error != 0) {
911 G_RAID_LOGREQ(0, bp,
912 "Read error during rebuild (%d), "
913 "possible data loss!",
914 bp->bio_error);
915 goto rebuild_round_done;
916 }
917
918 /*
919 * The read operation finished, queue the
920 * write and get out.
921 */
922 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
923 bp->bio_error);
924 bp->bio_cmd = BIO_WRITE;
925 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
926 bp->bio_offset = nsd->sd_rebuild_pos;
927 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
928 g_raid_subdisk_iostart(nsd, bp);
929 } else {
930 /*
931 * The write operation just finished. Do
932 * another. We keep cloning the master bio
933 * since it has the right buffers allocated to
934 * it.
935 */
936 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
937 bp->bio_error);
938 if (bp->bio_error != 0 ||
939 trs->trso_flags & TR_RAID1E_F_ABORT) {
940 if ((trs->trso_flags &
941 TR_RAID1E_F_ABORT) == 0) {
942 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
943 nsd, nsd->sd_disk);
944 }
945 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
946 g_raid_tr_raid1e_rebuild_abort(tr);
947 return;
948 }
949 rebuild_round_done:
950 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
951 g_raid_unlock_range(tr->tro_volume,
952 trs->trso_lock_pos, trs->trso_lock_len);
953 nsd->sd_rebuild_pos += bp->bio_length;
954 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
955 g_raid_tr_raid1e_rebuild_finish(tr);
956 return;
957 }
958
959 /* Abort rebuild if we are stopping */
960 if (trs->trso_stopping) {
961 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
962 g_raid_tr_raid1e_rebuild_abort(tr);
963 return;
964 }
965
966 if (--trs->trso_meta_update <= 0) {
967 g_raid_write_metadata(vol->v_softc,
968 vol, nsd, nsd->sd_disk);
969 trs->trso_meta_update =
970 g_raid1e_rebuild_meta_update;
971 /* Compensate short rebuild I/Os. */
972 if ((vol->v_disks_count % N) != 0 &&
973 vol->v_strip_size <
974 g_raid1e_rebuild_slab) {
975 trs->trso_meta_update *=
976 g_raid1e_rebuild_slab;
977 trs->trso_meta_update /=
978 vol->v_strip_size;
979 }
980 }
981 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
982 if (--trs->trso_recover_slabs <= 0)
983 return;
984 /* Run next rebuild iteration. */
985 g_raid_tr_raid1e_rebuild_some(tr);
986 }
987 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
988 /*
989 * read good sd, read bad sd in parallel. when both
990 * done, compare the buffers. write good to the bad
991 * if different. do the next bit of work.
992 */
993 panic("Somehow, we think we're doing a resync");
994 }
995 return;
996 }
997 pbp = bp->bio_parent;
998 pbp->bio_inbed++;
999 mask = (intptr_t)bp->bio_caller2;
1000 if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
1001 /*
1002 * Read failed on first drive. Retry the read error on
1003 * another disk drive, if available, before erroring out the
1004 * read.
1005 */
1006 sd->sd_disk->d_read_errs++;
1007 G_RAID_LOGREQ(0, bp,
1008 "Read error (%d), %d read errors total",
1009 bp->bio_error, sd->sd_disk->d_read_errs);
1010
1011 /*
1012 * If there are too many read errors, we move to degraded.
1013 * XXX Do we want to FAIL the drive (eg, make the user redo
1014 * everything to get it back in sync), or just degrade the
1015 * drive, which kicks off a resync?
1016 */
1017 do_write = 0;
1018 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1019 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1020 else if (mask == 0)
1021 do_write = 1;
1022
1023 /* Restore what we were doing. */
1024 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1025 V2P(vol, virtual, &disk, &offset, &start);
1026
1027 /* Find the other disk, and try to do the I/O to it. */
1028 mask |= 1 << copy;
1029 best = g_raid_tr_raid1e_select_read_disk(vol,
1030 disk, offset, start, mask);
1031 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1032 disk += best;
1033 if (disk >= vol->v_disks_count) {
1034 disk -= vol->v_disks_count;
1035 offset += vol->v_strip_size;
1036 }
1037 cbp->bio_offset = offset + start;
1038 cbp->bio_length = bp->bio_length;
1039 cbp->bio_data = bp->bio_data;
1040 cbp->bio_ma = bp->bio_ma;
1041 cbp->bio_ma_offset = bp->bio_ma_offset;
1042 cbp->bio_ma_n = bp->bio_ma_n;
1043 g_destroy_bio(bp);
1044 nsd = &vol->v_subdisks[disk];
1045 G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1046 nsd->sd_pos);
1047 if (do_write)
1048 mask |= 1 << 31;
1049 if ((mask & (1U << 31)) != 0)
1050 sd->sd_recovery++;
1051 cbp->bio_caller2 = (void *)mask;
1052 if (do_write) {
1053 cbp->bio_caller1 = nsd;
1054 /* Lock callback starts I/O */
1055 g_raid_lock_range(sd->sd_volume,
1056 virtual, cbp->bio_length, pbp, cbp);
1057 } else {
1058 g_raid_subdisk_iostart(nsd, cbp);
1059 }
1060 return;
1061 }
1062 /*
1063 * We can't retry. Return the original error by falling
1064 * through. This will happen when there's only one good disk.
1065 * We don't need to fail the raid, since its actual state is
1066 * based on the state of the subdisks.
1067 */
1068 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1069 }
1070 if (bp->bio_cmd == BIO_READ &&
1071 bp->bio_error == 0 &&
1072 (mask & (1U << 31)) != 0) {
1073 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1074
1075 /* Restore what we were doing. */
1076 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1077 V2P(vol, virtual, &disk, &offset, &start);
1078
1079 /* Find best disk to write. */
1080 best = g_raid_tr_raid1e_select_read_disk(vol,
1081 disk, offset, start, ~mask);
1082 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1083 disk += best;
1084 if (disk >= vol->v_disks_count) {
1085 disk -= vol->v_disks_count;
1086 offset += vol->v_strip_size;
1087 }
1088 cbp->bio_offset = offset + start;
1089 cbp->bio_cmd = BIO_WRITE;
1090 cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1091 cbp->bio_caller2 = (void *)mask;
1092 g_destroy_bio(bp);
1093 G_RAID_LOGREQ(2, cbp,
1094 "Attempting bad sector remap on failing drive.");
1095 g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1096 return;
1097 }
1098 }
1099 if ((mask & (1U << 31)) != 0) {
1100 /*
1101 * We're done with a recovery, mark the range as unlocked.
1102 * For any write errors, we aggressively fail the disk since
1103 * there was both a READ and a WRITE error at this location.
1104 * Both types of errors generally indicates the drive is on
1105 * the verge of total failure anyway. Better to stop trusting
1106 * it now. However, we need to reset error to 0 in that case
1107 * because we're not failing the original I/O which succeeded.
1108 */
1109
1110 /* Restore what we were doing. */
1111 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1112 V2P(vol, virtual, &disk, &offset, &start);
1113
1114 for (copy = 0; copy < N; copy++) {
1115 if ((mask & (1 << copy) ) != 0)
1116 vol->v_subdisks[(disk + copy) %
1117 vol->v_disks_count].sd_recovery--;
1118 }
1119
1120 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1121 G_RAID_LOGREQ(0, bp, "Remap write failed: "
1122 "failing subdisk.");
1123 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1124 bp->bio_error = 0;
1125 }
1126 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1127 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1128 }
1129 if (pbp->bio_cmd != BIO_READ) {
1130 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1131 pbp->bio_error = bp->bio_error;
1132 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1133 G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1134 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1135 }
1136 error = pbp->bio_error;
1137 } else
1138 error = bp->bio_error;
1139 g_destroy_bio(bp);
1140 if (pbp->bio_children == pbp->bio_inbed) {
1141 pbp->bio_completed = pbp->bio_length;
1142 g_raid_iodone(pbp, error);
1143 }
1144 }
1145
1146 static int
1147 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1148 off_t boffset, size_t blength)
1149 {
1150 struct g_raid_volume *vol;
1151 struct g_raid_subdisk *sd;
1152 struct bio_queue_head queue;
1153 char *addr;
1154 off_t offset, start, length, remain;
1155 u_int no, strip_size;
1156 int i, error;
1157
1158 vol = tr->tro_volume;
1159 addr = virtual;
1160 strip_size = vol->v_strip_size;
1161 V2P(vol, boffset, &no, &offset, &start);
1162 remain = blength;
1163 bioq_init(&queue);
1164 while (remain > 0) {
1165 length = MIN(strip_size - start, remain);
1166 for (i = 0; i < N; i++) {
1167 sd = &vol->v_subdisks[no];
1168 switch (sd->sd_state) {
1169 case G_RAID_SUBDISK_S_ACTIVE:
1170 case G_RAID_SUBDISK_S_STALE:
1171 case G_RAID_SUBDISK_S_RESYNC:
1172 break;
1173 case G_RAID_SUBDISK_S_REBUILD:
1174 if (offset + start >= sd->sd_rebuild_pos)
1175 goto nextdisk;
1176 break;
1177 default:
1178 goto nextdisk;
1179 }
1180 error = g_raid_subdisk_kerneldump(sd, addr,
1181 offset + start, length);
1182 if (error != 0)
1183 return (error);
1184 nextdisk:
1185 if (++no >= vol->v_disks_count) {
1186 no = 0;
1187 offset += strip_size;
1188 }
1189 }
1190 remain -= length;
1191 addr += length;
1192 start = 0;
1193 }
1194 return (0);
1195 }
1196
1197 static int
1198 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1199 {
1200 struct bio *bp;
1201 struct g_raid_subdisk *sd;
1202
1203 bp = (struct bio *)argp;
1204 sd = (struct g_raid_subdisk *)bp->bio_caller1;
1205 g_raid_subdisk_iostart(sd, bp);
1206
1207 return (0);
1208 }
1209
1210 static int
1211 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1212 {
1213 struct g_raid_tr_raid1e_object *trs;
1214 struct g_raid_volume *vol;
1215
1216 vol = tr->tro_volume;
1217 trs = (struct g_raid_tr_raid1e_object *)tr;
1218 trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1219 trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1220 /* Compensate short rebuild I/Os. */
1221 if ((vol->v_disks_count % N) != 0 &&
1222 vol->v_strip_size < g_raid1e_rebuild_slab) {
1223 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1224 trs->trso_recover_slabs /= vol->v_strip_size;
1225 }
1226 if (trs->trso_type == TR_RAID1E_REBUILD)
1227 g_raid_tr_raid1e_rebuild_some(tr);
1228 return (0);
1229 }
1230
1231 static int
1232 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1233 {
1234 struct g_raid_tr_raid1e_object *trs;
1235
1236 trs = (struct g_raid_tr_raid1e_object *)tr;
1237
1238 if (trs->trso_buffer != NULL) {
1239 free(trs->trso_buffer, M_TR_RAID1E);
1240 trs->trso_buffer = NULL;
1241 }
1242 return (0);
1243 }
1244
/* Register this transformation module with the g_raid class. */
G_RAID_TR_DECLARE(raid1e, "RAID1E");