1 /*
2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 /*
36 * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
37 *
38 * See also hammer_undo.c
39 */
40
41 #include "hammer.h"
42
/*
 * Instantiate the red-black tree support code for hammer_redo_rb_tree.
 * Nodes are hammer_inode structures linked through their rb_redonode
 * field and ordered by hammer_redo_rb_compare.  The tree is rooted at
 * hmp->rb_redo_root and is manipulated in this file via RB_INSERT,
 * RB_REMOVE and RB_FIRST.
 *
 * NOTE(review): the trailing (hammer_off_t, redo_fifo_start) arguments
 * presumably name the key type and key field used for keyed lookups --
 * confirm against the RB_GENERATE2 definition in <sys/tree.h>.
 */
RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
	     hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
45
46 /*
47 * HAMMER version 4+ REDO support.
48 *
49 * REDO records are used to improve fsync() performance. Instead of having
50 * to go through a complete double-flush cycle involving at least two disk
51 * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
52 * the related REDO records, which is a single synchronization requiring
53 * no track seeking. If a recovery becomes necessary the recovery code
54 * will generate logical data writes based on the REDO records encountered.
55 * That is, the recovery code will UNDO any partial meta-data/data writes
56 * at the raw disk block level and then REDO the data writes at the logical
57 * level.
58 */
59 int
60 hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
61 hammer_off_t file_off, u_int32_t flags,
62 void *base, int len)
63 {
64 hammer_mount_t hmp;
65 hammer_volume_t root_volume;
66 hammer_blockmap_t undomap;
67 hammer_buffer_t buffer = NULL;
68 hammer_fifo_redo_t redo;
69 hammer_fifo_tail_t tail;
70 hammer_off_t next_offset;
71 int error;
72 int bytes;
73 int n;
74
75 /*
76 * Setup
77 */
78 hmp = trans->hmp;
79
80 root_volume = trans->rootvol;
81 undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
82
83 /*
84 * No undo recursion when modifying the root volume
85 */
86 hammer_modify_volume(NULL, root_volume, NULL, 0);
87 hammer_lock_ex(&hmp->undo_lock);
88
89 /* undo had better not roll over (loose test) */
90 if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
91 panic("hammer: insufficient undo FIFO space!");
92
93 /*
94 * Loop until the undo for the entire range has been laid down.
95 * Loop at least once (len might be 0 as a degenerate case).
96 */
97 for (;;) {
98 /*
99 * Fetch the layout offset in the UNDO FIFO, wrap it as
100 * necessary.
101 */
102 if (undomap->next_offset == undomap->alloc_offset) {
103 undomap->next_offset =
104 HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
105 }
106 next_offset = undomap->next_offset;
107
108 /*
109 * This is a tail-chasing FIFO, when we hit the start of a new
110 * buffer we don't have to read it in.
111 */
112 if ((next_offset & HAMMER_BUFMASK) == 0) {
113 redo = hammer_bnew(hmp, next_offset, &error, &buffer);
114 hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
115 } else {
116 redo = hammer_bread(hmp, next_offset, &error, &buffer);
117 }
118 if (error)
119 break;
120 hammer_modify_buffer(NULL, buffer, NULL, 0);
121
122 /*
123 * Calculate how big a media structure fits up to the next
124 * alignment point and how large a data payload we can
125 * accomodate.
126 *
127 * If n calculates to 0 or negative there is no room for
128 * anything but a PAD.
129 */
130 bytes = HAMMER_UNDO_ALIGN -
131 ((int)next_offset & HAMMER_UNDO_MASK);
132 n = bytes -
133 (int)sizeof(struct hammer_fifo_redo) -
134 (int)sizeof(struct hammer_fifo_tail);
135
136 /*
137 * If available space is insufficient for any payload
138 * we have to lay down a PAD.
139 *
140 * The minimum PAD is 8 bytes and the head and tail will
141 * overlap each other in that case. PADs do not have
142 * sequence numbers or CRCs.
143 *
144 * A PAD may not start on a boundary. That is, every
145 * 512-byte block in the UNDO/REDO FIFO must begin with
146 * a record containing a sequence number.
147 */
148 if (n <= 0) {
149 KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
150 KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
151 tail = (void *)((char *)redo + bytes - sizeof(*tail));
152 if ((void *)redo != (void *)tail) {
153 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
154 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
155 tail->tail_size = bytes;
156 }
157 redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
158 redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
159 redo->head.hdr_size = bytes;
160 /* NO CRC OR SEQ NO */
161 undomap->next_offset += bytes;
162 hammer_modify_buffer_done(buffer);
163 hammer_stats_redo += bytes;
164 continue;
165 }
166
167 /*
168 * When generating an inode-related REDO record we track
169 * the point in the UNDO/REDO FIFO containing the inode's
170 * earliest REDO record. See hammer_generate_redo_sync().
171 *
172 * redo_fifo_next is cleared when an inode is staged to
173 * the backend and then used to determine how to reassign
174 * redo_fifo_start after the inode flush completes.
175 */
176 if (ip) {
177 redo->redo_objid = ip->obj_id;
178 redo->redo_localization = ip->obj_localization;
179 if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
180 ip->redo_fifo_start = next_offset;
181 if (RB_INSERT(hammer_redo_rb_tree,
182 &hmp->rb_redo_root, ip)) {
183 panic("hammer_generate_redo: "
184 "cannot insert inode %p on "
185 "redo FIFO", ip);
186 }
187 ip->flags |= HAMMER_INODE_RDIRTY;
188 }
189 if (ip->redo_fifo_next == 0)
190 ip->redo_fifo_next = next_offset;
191 } else {
192 redo->redo_objid = 0;
193 redo->redo_localization = 0;
194 }
195
196 /*
197 * Calculate the actual payload and recalculate the size
198 * of the media structure as necessary. If no data buffer
199 * is supplied there is no payload.
200 */
201 if (base == NULL) {
202 n = 0;
203 } else if (n > len) {
204 n = len;
205 }
206 bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
207 ~HAMMER_HEAD_ALIGN_MASK) +
208 (int)sizeof(struct hammer_fifo_redo) +
209 (int)sizeof(struct hammer_fifo_tail);
210 if (hammer_debug_general & 0x0080) {
211 kprintf("redo %016llx %d %d\n",
212 (long long)next_offset, bytes, n);
213 }
214
215 redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
216 redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
217 redo->head.hdr_size = bytes;
218 redo->head.hdr_seq = hmp->undo_seqno++;
219 redo->head.hdr_crc = 0;
220 redo->redo_mtime = trans->time;
221 redo->redo_offset = file_off;
222 redo->redo_flags = flags;
223
224 /*
225 * Incremental payload. If no payload we throw the entire
226 * len into redo_data_bytes and will not loop.
227 */
228 if (base) {
229 redo->redo_data_bytes = n;
230 bcopy(base, redo + 1, n);
231 len -= n;
232 base = (char *)base + n;
233 file_off += n;
234 } else {
235 redo->redo_data_bytes = len;
236 file_off += len;
237 len = 0;
238 }
239
240 tail = (void *)((char *)redo + bytes - sizeof(*tail));
241 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
242 tail->tail_type = HAMMER_HEAD_TYPE_REDO;
243 tail->tail_size = bytes;
244
245 KKASSERT(bytes >= sizeof(redo->head));
246 redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
247 crc32(&redo->head + 1, bytes - sizeof(redo->head));
248 undomap->next_offset += bytes;
249 hammer_stats_redo += bytes;
250
251 /*
252 * Before we finish off the buffer we have to deal with any
253 * junk between the end of the media structure we just laid
254 * down and the UNDO alignment boundary. We do this by laying
255 * down a dummy PAD. Even though we will probably overwrite
256 * it almost immediately we have to do this so recovery runs
257 * can iterate the UNDO space without having to depend on
258 * the indices in the volume header.
259 *
260 * This dummy PAD will be overwritten on the next undo so
261 * we do not adjust undomap->next_offset.
262 */
263 bytes = HAMMER_UNDO_ALIGN -
264 ((int)undomap->next_offset & HAMMER_UNDO_MASK);
265 if (bytes != HAMMER_UNDO_ALIGN) {
266 KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
267 redo = (void *)(tail + 1);
268 tail = (void *)((char *)redo + bytes - sizeof(*tail));
269 if ((void *)redo != (void *)tail) {
270 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
271 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
272 tail->tail_size = bytes;
273 }
274 redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
275 redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
276 redo->head.hdr_size = bytes;
277 /* NO CRC OR SEQ NO */
278 }
279 hammer_modify_buffer_done(buffer);
280 if (len == 0)
281 break;
282 }
283 hammer_modify_volume_done(root_volume);
284 hammer_unlock(&hmp->undo_lock);
285
286 if (buffer)
287 hammer_rel_buffer(buffer, 0);
288
289 /*
290 * Make sure the nominal undo span contains at least one REDO_SYNC,
291 * otherwise the REDO recovery will not be triggered.
292 */
293 if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
294 flags != HAMMER_REDO_SYNC) {
295 hammer_generate_redo_sync(trans);
296 }
297
298 return(error);
299 }
300
301 /*
302 * Generate a REDO SYNC record. At least one such record must be generated
303 * in the nominal recovery span for the recovery code to be able to run
304 * REDOs outside of the span.
305 *
306 * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
307 * for all inodes with active REDOs. This changes dynamically as inodes
308 * get flushed.
309 *
310 * During recovery stage2 any new flush cycles must specify the original
311 * redo sync offset. That way a crash will re-run the REDOs, at least
312 * up to the point where the UNDO FIFO does not overwrite the area.
313 */
314 void
315 hammer_generate_redo_sync(hammer_transaction_t trans)
316 {
317 hammer_mount_t hmp = trans->hmp;
318 hammer_inode_t ip;
319 hammer_off_t redo_fifo_start;
320
321 if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
322 ip = NULL;
323 redo_fifo_start = hmp->recover_stage2_offset;
324 } else {
325 ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
326 if (ip)
327 redo_fifo_start = ip->redo_fifo_start;
328 else
329 redo_fifo_start = 0;
330 }
331 if (redo_fifo_start) {
332 if (hammer_debug_io & 0x0004) {
333 kprintf("SYNC IP %p %016jx\n",
334 ip, (intmax_t)redo_fifo_start);
335 }
336 hammer_generate_redo(trans, NULL, redo_fifo_start,
337 HAMMER_REDO_SYNC, NULL, 0);
338 trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
339 }
340 }
341
342 /*
343 * This is called when an inode is queued to the backend.
344 */
345 void
346 hammer_redo_fifo_start_flush(hammer_inode_t ip)
347 {
348 ip->redo_fifo_next = 0;
349 }
350
351 /*
352 * This is called when an inode backend flush is finished. We have to make
353 * sure that RDIRTY is not set unless dirty bufs are present. Dirty bufs
354 * can get destroyed through operations such as truncations and leave
355 * us with a stale redo_fifo_next.
356 */
357 void
358 hammer_redo_fifo_end_flush(hammer_inode_t ip)
359 {
360 hammer_mount_t hmp = ip->hmp;
361
362 if (ip->flags & HAMMER_INODE_RDIRTY) {
363 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
364 ip->flags &= ~HAMMER_INODE_RDIRTY;
365 }
366 if ((ip->flags & HAMMER_INODE_BUFS) == 0)
367 ip->redo_fifo_next = 0;
368 if (ip->redo_fifo_next) {
369 ip->redo_fifo_start = ip->redo_fifo_next;
370 if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
371 panic("hammer_generate_redo: cannot reinsert "
372 "inode %p on redo FIFO",
373 ip);
374 }
375 ip->flags |= HAMMER_INODE_RDIRTY;
376 }
377 }
Cache object: f98f10c010e62220b18079eec71f5da5
|