/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the ARC buffered data (ABD).
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation
 * because, when we are at the limit of allocatable space, using equal-size
 * chunks allows us to quickly reclaim enough space for a new large
 * allocation (assuming it is also scattered).
 *
 * ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size", KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size", KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size", KSTAT_DATA_UINT64 },
};

struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
} abd_sums;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABDs for. Smaller allocations will use linear ABDs, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABDs use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. a 2KB scatter allocation wastes
 * half of each page). Using linear ABDs for small allocations means that
 * they will be put on slabs which contain many allocations.
 *
 * Linear ABDs for multi-page allocations are easier to use, and in some
 * cases they allow buffer copying to be avoided. However, allocation and
 * especially freeing of multi-page linear ABDs are expensive operations due
 * to KVA mapping and unmapping, and over time they cause KVA fragmentation.
 */
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;

#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);

SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
    &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
    &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are
 * just a single zero'd page-sized buffer. This allows us to conserve
 * memory by only using a single zero buffer for the scatter chunks.
 */
abd_t *abd_zero_scatter = NULL;

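/*
 * Return the number of page-sized chunks needed to hold "size" bytes.
 */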
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return ((size + PAGE_MASK) >> PAGE_SHIFT);
}

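/*
 * Return the number of chunks backing a scatter ABD, accounting for the
 * offset of the data within the first chunk.
 */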
static inline uint_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
}

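/*
 * Return B_TRUE if an allocation of "size" bytes should be linear, either
 * because scatter ABDs are disabled or because the size is below
 * zfs_abd_scatter_min_size.
 */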
boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

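/*
 * Update the scatter ABD kstats and the ARC chunk-waste space accounting
 * when a scatter ABD is allocated (ABDSTAT_INCR) or freed (ABDSTAT_DECR).
 */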
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	uint_t n = abd_scatter_chunkcnt(abd);
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = (n << PAGE_SHIFT) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

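/*
 * Update the linear ABD kstats when a linear ABD is allocated (ABDSTAT_INCR)
 * or freed (ABDSTAT_DECR).
 */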
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

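/*
 * Sanity check a scatter ABD: it must not be marked as a linear page, its
 * offset must fit within the first chunk, and every chunk pointer must be
 * populated.
 */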
void
abd_verify_scatter(abd_t *abd)
{
	uint_t i, n;

	/*
	 * There are no scatter linear pages in FreeBSD, so it is an error
	 * if the ABD has been marked as a linear page.
	 */
	ASSERT(!abd_is_linear_page(abd));
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE);
	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
	}
}

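/*
 * Allocate the page-sized chunks backing a scatter ABD of "size" bytes from
 * the chunk cache.
 */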
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(size);
	for (i = 0; i < n; i++) {
		ABD_SCATTER(abd).abd_chunks[i] =
		    kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
	}
}

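/*
 * Return all of a scatter ABD's chunks to the chunk cache.
 */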
void
abd_free_chunks(abd_t *abd)
{
	uint_t i, n;

	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		kmem_cache_free(abd_chunk_cache,
		    ABD_SCATTER(abd).abd_chunks[i]);
	}
}

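/*
 * Allocate an abd_t struct with enough trailing space for the chunk
 * pointers needed by a scatter ABD of "size" bytes.
 */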
abd_t *
abd_alloc_struct_impl(size_t size)
{
	uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
	/*
	 * In the event we are allocating a gang ABD, the size passed in
	 * will be 0. We must make sure to set abd_size to the size of an
	 * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
	 * ABD struct allocation accounts for an additional 24 bytes over
	 * a scatter ABD with 0 chunks.
	 */
	size_t abd_size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, abd_size);

	return (abd);
}

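/*
 * Free an abd_t struct allocated by abd_alloc_struct_impl(), computing its
 * size from the chunk count (zero for linear and gang ABDs).
 */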
void
abd_free_struct_impl(abd_t *abd)
{
	uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
	    abd_scatter_chunkcnt(abd);
	ssize_t size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}

/*
 * Allocate a scatter ABD of size SPA_MAXBLOCKSIZE, where each chunk in the
 * scatterlist points to the same zero'd region (zero_region).
 */
_Static_assert(ZERO_REGION_SIZE >= PAGE_SIZE, "zero_region too small");
static void
abd_alloc_zero_scatter(void)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;

	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;

	for (i = 0; i < n; i++) {
		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
		    __DECONST(void *, zero_region);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE);
}

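/*
 * Tear down abd_zero_scatter. Only the kstats and the abd_t struct are
 * released; the shared zero_region pages are not owned by the ABD and are
 * not freed here.
 */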
static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE);

	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
}

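/*
 * kstat update callback: refuse writes and publish the current wmsum
 * counter values through the abdstats kstat.
 */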
static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	return (0);
}

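/*
 * Set up the page-sized chunk cache, the wmsum counters behind the abdstats
 * kstat, the kstat itself, and the shared zero-filled scatter ABD.
 */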
void
abd_init(void)
{
	abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

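/*
 * Tear down everything set up by abd_init(): the zero scatter ABD, the
 * kstat, the wmsum counters, and the chunk cache.
 */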
void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

void
abd_free_linear_page(abd_t *abd)
{
	/*
	 * FreeBSD does not have scatter linear pages,
	 * so this should never be reached.
	 */
	VERIFY(0);
}

/*
 * If we are going to use this ABD for I/O through the block layer, the
 * consumer of the ABD data does not care whether it is scattered or not,
 * and we do not plan to keep this ABD in memory for a long period of time,
 * so we should allocate the ABD type that requires the least data copying
 * to do the I/O.
 *
 * Currently this is linear ABDs, however if ldi_strategy() can ever issue
 * I/Os using a scatter/gather list we should switch to that and replace
 * this call with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

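/*
 * Set up an ABD that covers a subrange of the scatter ABD "sabd" starting
 * at "off" bytes and spanning "size" bytes, sharing the parent's chunk
 * pointers rather than copying any data. If the caller-provided struct is
 * too small to hold the required chunk pointers, a new one is allocated.
 */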
abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
	size_t chunkcnt = abd_chunkcnt_for_bytes(
	    (new_offset & PAGE_MASK) + size);

	ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd));

	/*
	 * If an abd struct is provided, it is only the minimum size. If we
	 * need additional chunks, we need to allocate a new struct.
	 */
	if (abd != NULL &&
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
	    sizeof (abd_t)) {
		abd = NULL;
	}

	if (abd == NULL)
		abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;

	/* Copy the scatterlist starting at the correct offset */
	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
	    &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
	    chunkcnt * sizeof (void *));

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk
 * is in use. This can be safely called when the aiter has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when the
 * aiter has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	abd_t *abd = aiter->iter_abd;
	size_t offset = aiter->iter_pos;
	if (abd_is_linear(abd)) {
		aiter->iter_mapsize = abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(abd);
	} else {
		offset += ABD_SCATTER(abd).abd_offset;
		paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
		offset &= PAGE_MASK;
		aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
		    abd->abd_size - aiter->iter_pos);
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * aiter has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	if (!abd_iter_at_end(aiter)) {
		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
		ASSERT3U(aiter->iter_mapsize, >, 0);
	}

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

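/*
 * Request that the kmem subsystem reap the chunk cache soon, returning idle
 * chunks to the system.
 */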
void
abd_cache_reap_now(void)
{
	kmem_cache_reap_soon(abd_chunk_cache);
}