FreeBSD/Linux Kernel Cross Reference
sys/sys/wapbl.h
1 /* $NetBSD: wapbl.h,v 1.2 2008/07/31 05:38:06 simonb Exp $ */
2
3 /*-
4 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #ifndef _SYS_WAPBL_H
33 #define _SYS_WAPBL_H
34
35 #include <sys/mutex.h>
36
37 #include <miscfs/specfs/specdev.h>
38
39 /* This header file describes the api and data structures for
40 * write ahead physical block logging (WAPBL) support.
41 */
42
43 #if defined(_KERNEL_OPT)
44 #include "opt_wapbl.h"
45 #endif
46
#ifdef WAPBL_DEBUG

/*
 * Default value of WAPBL_DEBUG_PRINT for debug builds, used only when
 * the build has not already supplied one.
 */
#if !defined(WAPBL_DEBUG_PRINT)
#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif

/* Additional debug switches; compiled out unless this #if is flipped. */
#if 0
#define WAPBL_DEBUG_BUFBYTES
#define WAPBL_DEBUG_SERIALIZE
#endif

#endif /* WAPBL_DEBUG */
58
59 #ifdef WAPBL_DEBUG_PRINT
60
/*
 * Debug message classes.  Every constant occupies its own bit, so any
 * combination can be OR-ed together into the mask that WAPBL_PRINTF()
 * tests against wapbl_debug_print.
 */
enum {
	WAPBL_PRINT_OPEN	= 1 << 0,
	WAPBL_PRINT_FLUSH	= 1 << 1,
	WAPBL_PRINT_TRUNCATE	= 1 << 2,
	WAPBL_PRINT_TRANSACTION	= 1 << 3,
	WAPBL_PRINT_BUFFER	= 1 << 4,
	WAPBL_PRINT_BUFFER2	= 1 << 5,
	WAPBL_PRINT_ALLOC	= 1 << 6,
	WAPBL_PRINT_INODE	= 1 << 7,
	WAPBL_PRINT_WRITE	= 1 << 8,
	WAPBL_PRINT_IO		= 1 << 9,
	WAPBL_PRINT_REPLAY	= 1 << 10,
	WAPBL_PRINT_ERROR	= 1 << 11,
	WAPBL_PRINT_DISCARD	= 1 << 12,
	WAPBL_PRINT_BIODONE	= 1 << 13,
};
77
/*
 * Emit a debug message when any bit of `mask' is set in wapbl_debug_print.
 *
 * `a' is the complete, parenthesized printf argument list, e.g.
 *	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("opened %d\n", n));
 *
 * The do/while (0) wrapper turns the expansion into exactly one statement.
 * A bare `if (...) printf a' would silently capture the `else' of an
 * enclosing unbraced if/else at the call site.
 */
#define WAPBL_PRINTF(mask, a)					\
	do {							\
		if (wapbl_debug_print & (mask))			\
			printf a;				\
	} while (/*CONSTCOND*/ 0)
extern int wapbl_debug_print;
80 #else
#define WAPBL_PRINTF(mask, a)	/* debug printing compiled out: expands to nothing */
82 #endif
83
84 /****************************************************************/
85
86 /* The WAPBL journal layout.
87 *
88 * The journal consists of a header followed by a circular buffer
89 * region. The circular data area is described by the header
90 * wc_circ_off, wc_circ_size, wc_head and wc_tail fields as bytes
91 * from the start of the journal header. New records are inserted
92 * at wc_head and the oldest valid record can be found at wc_tail.
93 * When ((wc_head == wc_tail) && (wc_head == 0)), the journal is empty.
94 * The condition of ((wc_head == wc_tail) && (wc_head != 0))
95 * indicates a full journal, although this condition is rare.
96 *
97 * The journal header as well as its records are marked by a 32bit
98 * type tag and length for ease of parsing. Journal records are
99 * padded so as to fall on journal device block boundaries.
100 * (XXX i think there is currently a bug wrt WC_BLOCKS not ending
101 * correctly on a journal device block boundary. this would need
102 * to be fixed if the journal blocksize does not match filesystem.)
103 */
104
/*
 * Journal record type tags.
 *
 * The journal uses these four record types.  Each tag identifies which
 * of the structures below describes the data that follows it.  The
 * values are consecutive, beginning with the ASCII bytes "WABL".
 */
enum {
	WAPBL_WC_HEADER		= 0x5741424c,	/* "WABL", struct wapbl_wc_header */
	WAPBL_WC_INODES		= 0x5741424d,	/* struct wapbl_wc_inodelist */
	WAPBL_WC_REVOCATIONS	= 0x5741424e,	/* struct wapbl_wc_blocklist */
	WAPBL_WC_BLOCKS		= 0x5741424f,	/* struct wapbl_wc_blocklist */
};
116
/*
 * Null entry (on disk).
 *
 * Never used as a record in its own right.  It has the same leading
 * fields (type tag, then length) as every other log structure, so a
 * record can be read through it first to find out what type it is.
 */
struct wapbl_wc_null {
	uint32_t	wc_type;	/* WAPBL_WC_* */
	int32_t		wc_len;
	uint8_t		wc_spare[0];	/* actually longer */
};
127
/*
 * Journal header (on disk).
 *
 * Located at the start of the journal, outside the circular buffer
 * region.  Besides describing the journal parameters and the matching
 * filesystem, it also serves as the atomic update record for journal
 * updates.
 */
struct wapbl_wc_header {
	uint32_t	wc_type;	/* WAPBL_WC_HEADER log magic number */
	int32_t		wc_len;		/* length of this journal entry */
	uint32_t	wc_checksum;
	uint32_t	wc_generation;
	int32_t		wc_fsid[2];
	uint64_t	wc_time;
	uint32_t	wc_timensec;
	uint32_t	wc_version;
	uint32_t	wc_log_dev_bshift;
	uint32_t	wc_fs_dev_bshift;
	int64_t		wc_head;
	int64_t		wc_tail;
	int64_t		wc_circ_off;	/* offset of circ buffer region */
	int64_t		wc_circ_size;	/* size of circular buffer region */
	uint8_t		wc_spare[0];	/* actually longer */
};
152
/*
 * List of blocks (on disk).
 *
 * Describes a set of filesystem blocks.  The same layout is used with
 * two type tags, WAPBL_WC_BLOCKS and WAPBL_WC_REVOCATIONS.
 *
 * WAPBL_WC_BLOCKS: a copy of each listed block follows the record,
 * starting at the next log device blocksize boundary, i.e. one log
 * device block past the start of the record.  These records carry the
 * bulk of the filesystem journal data, which is written here before
 * being written into the filesystem.
 *
 * WAPBL_WC_REVOCATIONS: indicates that any previously listed blocks
 * should not be written into the filesystem.  This matters so that
 * deallocated and reallocated data blocks do not get overwritten with
 * stale data from the journal.  Revocation records do not contain a
 * copy of any actual block data.
 */
struct wapbl_wc_blocklist {
	uint32_t	wc_type;	/* WAPBL_WC_{REVOCATIONS,BLOCKS} */
	int32_t		wc_len;
	int32_t		wc_blkcount;
	int32_t		wc_unused;
	struct {
		int64_t	wc_daddr;
		int32_t	wc_unused;
		int32_t	wc_dlen;
	} wc_blocks[0];			/* actually longer */
};
181
/*
 * List of inodes (on disk).
 *
 * Describes the set of inodes which may be allocated but are unlinked.
 * Inodes are listed here while they are being initialized or
 * deinitialized.  Inodes unlinked while in use by a process are also
 * listed here, and their actual deletion must be completed on journal
 * replay.
 */
struct wapbl_wc_inodelist {
	uint32_t	wc_type;	/* WAPBL_WC_INODES */
	int32_t		wc_len;
	int32_t		wc_inocnt;
	int32_t		wc_clear;	/* set if previously listed inodes
					   should be ignored */
	struct {
		uint32_t wc_inumber;
		uint32_t wc_imode;
	} wc_inodes[0];			/* actually longer */
};
201
202 /****************************************************************/
203
204 #include <sys/queue.h>
205 #include <sys/vnode.h>
206 #include <sys/buf.h>
207
208 typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int);
209
210 #ifdef _KERNEL
211
/* Forward declarations for types used through pointers below. */
struct wapbl;
struct wapbl_entry;
struct wapbl_replay;
struct wapbl_wc_header;

/*
 * Per-transaction log information.
 */
struct wapbl_entry {
	struct wapbl	*we_wapbl;
	SIMPLEQ_ENTRY(wapbl_entry) we_entries;
	size_t		we_bufcount;		/* Count of unsynced buffers */
	size_t		we_reclaimable_bytes;	/* Number of on-disk bytes
						   for this transaction */
	int		we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
	size_t		we_unsynced_bufbytes;	/* Byte count of unsynced
						   buffers */
#endif
};
231
232 void wapbl_init(void);
233
234 /* Start using a log */
235 int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
236 size_t, size_t, struct wapbl_replay *,
237 wapbl_flush_fn_t, wapbl_flush_fn_t);
238
239 /* Discard the current transaction, potentially dangerous */
240 void wapbl_discard(struct wapbl *);
241
242 /* stop using a log */
243 int wapbl_stop(struct wapbl *, int);
244
245 /*
246 * Begin a new transaction or increment transaction recursion
247 * level if called while a transaction is already in progress
248 * by the current process.
249 */
250 int wapbl_begin(struct wapbl *, const char *, int);
251
252
253 /* End a transaction or decrement the transaction recursion level */
254 void wapbl_end(struct wapbl *);
255
256 /*
257 * Add a new buffer to the current transaction. The buffers
258 * data will be copied to the current transaction log and the
259 * buffer will be marked B_LOCKED so that it will not be
260 * flushed to disk by the syncer or reallocated.
261 */
262 void wapbl_add_buf(struct wapbl *, struct buf *);
263
264 /* Remove a buffer from the current transaction. */
265 void wapbl_remove_buf(struct wapbl *, struct buf *);
266
267 void wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
268
269 /*
270 * This will flush all completed transactions to disk and
271 * start asynchronous writes on the associated buffers
272 */
273 int wapbl_flush(struct wapbl *, int);
274
275 /*
276 * Inodes that are allocated but have zero link count
277 * must be registered with the current transaction
278 * so they may be recorded in the log and cleaned up later.
279 * registration/unregistration of ino numbers already registered is ok.
280 */
281 void wapbl_register_inode(struct wapbl *, ino_t, mode_t);
282 void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
283
284 /*
285 * Metadata block deallocations must be registered so
286 * that revocations records can be written and to prevent
287 * the corresponding blocks from being reused as data
288 * blocks until the log is on disk.
289 */
290 void wapbl_register_deallocation(struct wapbl *, daddr_t, int);
291
292 void wapbl_jlock_assert(struct wapbl *wl);
293 void wapbl_junlock_assert(struct wapbl *wl);
294
295 void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...));
296
297 #if defined(WAPBL_DEBUG) || defined(DDB)
298 void wapbl_dump(struct wapbl *);
299 #endif
300
301 void wapbl_biodone(struct buf *);
302
303 extern struct wapbl_ops wapbl_ops;
304
305 static __inline struct mount *
306 wapbl_vptomp(struct vnode *vp)
307 {
308 struct mount *mp;
309
310 mp = NULL;
311 if (vp != NULL) {
312 if (vp->v_type == VBLK)
313 mp = vp->v_specmountpoint;
314 else
315 mp = vp->v_mount;
316 }
317
318 return mp;
319 }
320
321 static __inline bool
322 wapbl_vphaswapbl(struct vnode *vp)
323 {
324 struct mount *mp;
325
326 if (vp == NULL)
327 return false;
328
329 mp = wapbl_vptomp(vp);
330 if (mp && mp->mnt_wapbl)
331 return true;
332 else
333 return false;
334 }
335
336 #endif /* _KERNEL */
337
338 /****************************************************************/
339 /* Replay support */
340
341 struct wapbl_replay {
342 struct vnode *wr_logvp;
343 struct vnode *wr_devvp;
344 daddr_t wr_logpbn;
345
346 struct wapbl_wc_header wr_wc_header;
347 void *wr_scratch;
348
349 LIST_HEAD(wapbl_blk_head, wapbl_blk) *wr_blkhash;
350 u_long wr_blkhashmask;
351 int wr_blkhashcnt;
352
353 off_t wr_inodeshead;
354 off_t wr_inodestail;
355 int wr_inodescnt;
356 struct {
357 uint32_t wr_inumber;
358 uint32_t wr_imode;
359 } *wr_inodes;
360 };
361
/* Nonzero when the replay state's wr_scratch pointer is set. */
#define wapbl_replay_isopen(wr)	((wr)->wr_scratch != NULL)
363
364 int wapbl_replay_isopen1(struct wapbl_replay *);
365 int wapbl_replay_start(struct wapbl_replay **, struct vnode *,
366 daddr_t, size_t, size_t);
367 void wapbl_replay_stop(struct wapbl_replay *);
368 void wapbl_replay_free(struct wapbl_replay *);
369 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
370 int wapbl_replay_write(struct wapbl_replay *, struct vnode *);
371 int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
372
373 /****************************************************************/
374
375 /* Supply this to provide i/o support */
376 int wapbl_write(void *, size_t, struct vnode *, daddr_t);
377 int wapbl_read(void *, size_t, struct vnode *, daddr_t);
378
379 /****************************************************************/
380
381 #endif /* !_SYS_WAPBL_H */
Cache object: b7e00182d2c4bd1223ef7522e68ee7af
|