1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
28 */
29
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
34 #include <sys/dmu.h>
35 #include <sys/dbuf.h>
36 #include <sys/kstat.h>
37 #include <sys/wmsum.h>
38
39 /*
40 * This tunable disables predictive prefetch. Note that it leaves "prescient"
41 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
42 * prescient prefetch never issues i/os that end up not being needed,
43 * so it can't hurt performance.
44 */
45
46 static int zfs_prefetch_disable = B_FALSE;
47
48 /* max # of streams per zfetch */
49 static unsigned int zfetch_max_streams = 8;
50 /* min time before stream reclaim */
51 static unsigned int zfetch_min_sec_reap = 1;
52 /* max time before stream delete */
53 static unsigned int zfetch_max_sec_reap = 2;
54 /* min bytes to prefetch per stream (default 4MB) */
55 static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
56 /* max bytes to prefetch per stream (default 64MB) */
57 unsigned int zfetch_max_distance = 64 * 1024 * 1024;
58 /* max bytes to prefetch indirects for per stream (default 64MB) */
59 unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
60 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
61 uint64_t zfetch_array_rd_sz = 1024 * 1024;
62
63 typedef struct zfetch_stats {
64 kstat_named_t zfetchstat_hits;
65 kstat_named_t zfetchstat_misses;
66 kstat_named_t zfetchstat_max_streams;
67 kstat_named_t zfetchstat_io_issued;
68 } zfetch_stats_t;
69
70 static zfetch_stats_t zfetch_stats = {
71 { "hits", KSTAT_DATA_UINT64 },
72 { "misses", KSTAT_DATA_UINT64 },
73 { "max_streams", KSTAT_DATA_UINT64 },
74 { "io_issued", KSTAT_DATA_UINT64 },
75 };
76
77 struct {
78 wmsum_t zfetchstat_hits;
79 wmsum_t zfetchstat_misses;
80 wmsum_t zfetchstat_max_streams;
81 wmsum_t zfetchstat_io_issued;
82 } zfetch_sums;
83
84 #define ZFETCHSTAT_BUMP(stat) \
85 wmsum_add(&zfetch_sums.stat, 1)
86 #define ZFETCHSTAT_ADD(stat, val) \
87 wmsum_add(&zfetch_sums.stat, val)
88
89
90 static kstat_t *zfetch_ksp;
91
92 static int
93 zfetch_kstats_update(kstat_t *ksp, int rw)
94 {
95 zfetch_stats_t *zs = ksp->ks_data;
96
97 if (rw == KSTAT_WRITE)
98 return (EACCES);
99 zs->zfetchstat_hits.value.ui64 =
100 wmsum_value(&zfetch_sums.zfetchstat_hits);
101 zs->zfetchstat_misses.value.ui64 =
102 wmsum_value(&zfetch_sums.zfetchstat_misses);
103 zs->zfetchstat_max_streams.value.ui64 =
104 wmsum_value(&zfetch_sums.zfetchstat_max_streams);
105 zs->zfetchstat_io_issued.value.ui64 =
106 wmsum_value(&zfetch_sums.zfetchstat_io_issued);
107 return (0);
108 }
109
110 void
111 zfetch_init(void)
112 {
113 wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
114 wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
115 wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
116 wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
117
118 zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
119 KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
120 KSTAT_FLAG_VIRTUAL);
121
122 if (zfetch_ksp != NULL) {
123 zfetch_ksp->ks_data = &zfetch_stats;
124 zfetch_ksp->ks_update = zfetch_kstats_update;
125 kstat_install(zfetch_ksp);
126 }
127 }
128
129 void
130 zfetch_fini(void)
131 {
132 if (zfetch_ksp != NULL) {
133 kstat_delete(zfetch_ksp);
134 zfetch_ksp = NULL;
135 }
136
137 wmsum_fini(&zfetch_sums.zfetchstat_hits);
138 wmsum_fini(&zfetch_sums.zfetchstat_misses);
139 wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
140 wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
141 }
142
143 /*
144 * This takes a pointer to a zfetch structure and a dnode. It performs the
145 * necessary setup for the zfetch structure, grokking data from the
146 * associated dnode.
147 */
148 void
149 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
150 {
151 if (zf == NULL)
152 return;
153 zf->zf_dnode = dno;
154 zf->zf_numstreams = 0;
155
156 list_create(&zf->zf_stream, sizeof (zstream_t),
157 offsetof(zstream_t, zs_node));
158
159 mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
160 }
161
162 static void
163 dmu_zfetch_stream_fini(zstream_t *zs)
164 {
165 ASSERT(!list_link_active(&zs->zs_node));
166 zfs_refcount_destroy(&zs->zs_callers);
167 zfs_refcount_destroy(&zs->zs_refs);
168 kmem_free(zs, sizeof (*zs));
169 }
170
171 static void
172 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
173 {
174 ASSERT(MUTEX_HELD(&zf->zf_lock));
175 list_remove(&zf->zf_stream, zs);
176 zf->zf_numstreams--;
177 membar_producer();
178 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
179 dmu_zfetch_stream_fini(zs);
180 }
181
182 /*
183 * Clean-up state associated with a zfetch structure (e.g. destroy the
184 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
185 */
186 void
187 dmu_zfetch_fini(zfetch_t *zf)
188 {
189 zstream_t *zs;
190
191 mutex_enter(&zf->zf_lock);
192 while ((zs = list_head(&zf->zf_stream)) != NULL)
193 dmu_zfetch_stream_remove(zf, zs);
194 mutex_exit(&zf->zf_lock);
195 list_destroy(&zf->zf_stream);
196 mutex_destroy(&zf->zf_lock);
197
198 zf->zf_dnode = NULL;
199 }
200
201 /*
202 * If there aren't too many active streams already, create one more.
203 * In process delete/reuse all streams without hits for zfetch_max_sec_reap.
204 * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever.
205 * The "blkid" argument is the next block that we expect this stream to access.
206 */
207 static void
208 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
209 {
210 zstream_t *zs, *zs_next, *zs_old = NULL;
211 hrtime_t now = gethrtime(), t;
212
213 ASSERT(MUTEX_HELD(&zf->zf_lock));
214
215 /*
216 * Delete too old streams, reusing the first found one.
217 */
218 t = now - SEC2NSEC(zfetch_max_sec_reap);
219 for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
220 zs_next = list_next(&zf->zf_stream, zs);
221 /*
222 * Skip if still active. 1 -- zf_stream reference.
223 */
224 if (zfs_refcount_count(&zs->zs_refs) != 1)
225 continue;
226 if (zs->zs_atime > t)
227 continue;
228 if (zs_old)
229 dmu_zfetch_stream_remove(zf, zs);
230 else
231 zs_old = zs;
232 }
233 if (zs_old) {
234 zs = zs_old;
235 goto reuse;
236 }
237
238 /*
239 * The maximum number of streams is normally zfetch_max_streams,
240 * but for small files we lower it such that it's at least possible
241 * for all the streams to be non-overlapping.
242 */
243 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
244 zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
245 zfetch_max_distance));
246 if (zf->zf_numstreams >= max_streams) {
247 t = now - SEC2NSEC(zfetch_min_sec_reap);
248 for (zs = list_head(&zf->zf_stream); zs != NULL;
249 zs = list_next(&zf->zf_stream, zs)) {
250 if (zfs_refcount_count(&zs->zs_refs) != 1)
251 continue;
252 if (zs->zs_atime > t)
253 continue;
254 if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
255 zs_old = zs;
256 }
257 if (zs_old) {
258 zs = zs_old;
259 goto reuse;
260 }
261 ZFETCHSTAT_BUMP(zfetchstat_max_streams);
262 return;
263 }
264
265 zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
266 zs->zs_fetch = zf;
267 zfs_refcount_create(&zs->zs_callers);
268 zfs_refcount_create(&zs->zs_refs);
269 /* One reference for zf_stream. */
270 zfs_refcount_add(&zs->zs_refs, NULL);
271 zf->zf_numstreams++;
272 list_insert_head(&zf->zf_stream, zs);
273
274 reuse:
275 zs->zs_blkid = blkid;
276 zs->zs_pf_dist = 0;
277 zs->zs_pf_start = blkid;
278 zs->zs_pf_end = blkid;
279 zs->zs_ipf_dist = 0;
280 zs->zs_ipf_start = blkid;
281 zs->zs_ipf_end = blkid;
282 /* Allow immediate stream reuse until first hit. */
283 zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
284 zs->zs_missed = B_FALSE;
285 zs->zs_more = B_FALSE;
286 }
287
288 static void
289 dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
290 {
291 zstream_t *zs = arg;
292
293 if (io_issued && level == 0 && blkid < zs->zs_blkid)
294 zs->zs_more = B_TRUE;
295 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
296 dmu_zfetch_stream_fini(zs);
297 }
298
299 /*
300 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
301 * associates dnode access specified with blkid and nblks arguments with
302 * prefetch stream, predicts further accesses based on that stats and returns
303 * the stream pointer on success. That pointer must later be passed to
304 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
305 * release it. dmu_zfetch() is a wrapper for simple cases when window between
306 * prediction and prefetch initiation is not needed.
307 * fetch_data argument specifies whether actual data blocks should be fetched:
308 * FALSE -- prefetch only indirect blocks for predicted data blocks;
309 * TRUE -- prefetch predicted data blocks plus following indirect blocks.
310 */
311 zstream_t *
312 dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
313 boolean_t fetch_data, boolean_t have_lock)
314 {
315 zstream_t *zs;
316 spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
317
318 if (zfs_prefetch_disable)
319 return (NULL);
320 /*
321 * If we haven't yet loaded the indirect vdevs' mappings, we
322 * can only read from blocks that we carefully ensure are on
323 * concrete vdevs (or previously-loaded indirect vdevs). So we
324 * can't allow the predictive prefetcher to attempt reads of other
325 * blocks (e.g. of the MOS's dnode object).
326 */
327 if (!spa_indirect_vdevs_loaded(spa))
328 return (NULL);
329
330 /*
331 * As a fast path for small (single-block) files, ignore access
332 * to the first block.
333 */
334 if (!have_lock && blkid == 0)
335 return (NULL);
336
337 if (!have_lock)
338 rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
339
340 /*
341 * A fast path for small files for which no prefetch will
342 * happen.
343 */
344 uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
345 if (maxblkid < 2) {
346 if (!have_lock)
347 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
348 return (NULL);
349 }
350 mutex_enter(&zf->zf_lock);
351
352 /*
353 * Find matching prefetch stream. Depending on whether the accesses
354 * are block-aligned, first block of the new access may either follow
355 * the last block of the previous access, or be equal to it.
356 */
357 for (zs = list_head(&zf->zf_stream); zs != NULL;
358 zs = list_next(&zf->zf_stream, zs)) {
359 if (blkid == zs->zs_blkid) {
360 break;
361 } else if (blkid + 1 == zs->zs_blkid) {
362 blkid++;
363 nblks--;
364 break;
365 }
366 }
367
368 /*
369 * If the file is ending, remove the matching stream if found.
370 * If not found then it is too late to create a new one now.
371 */
372 uint64_t end_of_access_blkid = blkid + nblks;
373 if (end_of_access_blkid >= maxblkid) {
374 if (zs != NULL)
375 dmu_zfetch_stream_remove(zf, zs);
376 mutex_exit(&zf->zf_lock);
377 if (!have_lock)
378 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
379 return (NULL);
380 }
381
382 /* Exit if we already prefetched this block before. */
383 if (nblks == 0) {
384 mutex_exit(&zf->zf_lock);
385 if (!have_lock)
386 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
387 return (NULL);
388 }
389
390 if (zs == NULL) {
391 /*
392 * This access is not part of any existing stream. Create
393 * a new stream for it.
394 */
395 dmu_zfetch_stream_create(zf, end_of_access_blkid);
396 mutex_exit(&zf->zf_lock);
397 if (!have_lock)
398 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
399 ZFETCHSTAT_BUMP(zfetchstat_misses);
400 return (NULL);
401 }
402
403 /*
404 * This access was to a block that we issued a prefetch for on
405 * behalf of this stream. Calculate further prefetch distances.
406 *
407 * Start prefetch from the demand access size (nblks). Double the
408 * distance every access up to zfetch_min_distance. After that only
409 * if needed increase the distance by 1/8 up to zfetch_max_distance.
410 */
411 unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
412 unsigned int pf_nblks;
413 if (fetch_data) {
414 if (unlikely(zs->zs_pf_dist < nbytes))
415 zs->zs_pf_dist = nbytes;
416 else if (zs->zs_pf_dist < zfetch_min_distance)
417 zs->zs_pf_dist *= 2;
418 else if (zs->zs_more)
419 zs->zs_pf_dist += zs->zs_pf_dist / 8;
420 zs->zs_more = B_FALSE;
421 if (zs->zs_pf_dist > zfetch_max_distance)
422 zs->zs_pf_dist = zfetch_max_distance;
423 pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
424 } else {
425 pf_nblks = 0;
426 }
427 if (zs->zs_pf_start < end_of_access_blkid)
428 zs->zs_pf_start = end_of_access_blkid;
429 if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
430 zs->zs_pf_end = end_of_access_blkid + pf_nblks;
431
432 /*
433 * Do the same for indirects, starting where we will stop reading
434 * data blocks (and the indirects that point to them).
435 */
436 if (unlikely(zs->zs_ipf_dist < nbytes))
437 zs->zs_ipf_dist = nbytes;
438 else
439 zs->zs_ipf_dist *= 2;
440 if (zs->zs_ipf_dist > zfetch_max_idistance)
441 zs->zs_ipf_dist = zfetch_max_idistance;
442 pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
443 if (zs->zs_ipf_start < zs->zs_pf_end)
444 zs->zs_ipf_start = zs->zs_pf_end;
445 if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
446 zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
447
448 zs->zs_blkid = end_of_access_blkid;
449 /* Protect the stream from reclamation. */
450 zs->zs_atime = gethrtime();
451 zfs_refcount_add(&zs->zs_refs, NULL);
452 /* Count concurrent callers. */
453 zfs_refcount_add(&zs->zs_callers, NULL);
454 mutex_exit(&zf->zf_lock);
455
456 if (!have_lock)
457 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
458
459 ZFETCHSTAT_BUMP(zfetchstat_hits);
460 return (zs);
461 }
462
463 void
464 dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
465 {
466 zfetch_t *zf = zs->zs_fetch;
467 int64_t pf_start, pf_end, ipf_start, ipf_end;
468 int epbs, issued;
469
470 if (missed)
471 zs->zs_missed = missed;
472
473 /*
474 * Postpone the prefetch if there are more concurrent callers.
475 * It happens when multiple requests are waiting for the same
476 * indirect block. The last one will run the prefetch for all.
477 */
478 if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
479 /* Drop reference taken in dmu_zfetch_prepare(). */
480 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
481 dmu_zfetch_stream_fini(zs);
482 return;
483 }
484
485 mutex_enter(&zf->zf_lock);
486 if (zs->zs_missed) {
487 pf_start = zs->zs_pf_start;
488 pf_end = zs->zs_pf_start = zs->zs_pf_end;
489 } else {
490 pf_start = pf_end = 0;
491 }
492 ipf_start = zs->zs_ipf_start;
493 ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
494 mutex_exit(&zf->zf_lock);
495 ASSERT3S(pf_start, <=, pf_end);
496 ASSERT3S(ipf_start, <=, ipf_end);
497
498 epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
499 ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
500 ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
501 ASSERT3S(ipf_start, <=, ipf_end);
502 issued = pf_end - pf_start + ipf_end - ipf_start;
503 if (issued > 1) {
504 /* More references on top of taken in dmu_zfetch_prepare(). */
505 for (int i = 0; i < issued - 1; i++)
506 zfs_refcount_add(&zs->zs_refs, NULL);
507 } else if (issued == 0) {
508 /* Some other thread has done our work, so drop the ref. */
509 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
510 dmu_zfetch_stream_fini(zs);
511 return;
512 }
513
514 if (!have_lock)
515 rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
516
517 issued = 0;
518 for (int64_t blk = pf_start; blk < pf_end; blk++) {
519 issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
520 ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
521 }
522 for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
523 issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
524 ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
525 }
526
527 if (!have_lock)
528 rw_exit(&zf->zf_dnode->dn_struct_rwlock);
529
530 if (issued)
531 ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
532 }
533
534 void
535 dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
536 boolean_t missed, boolean_t have_lock)
537 {
538 zstream_t *zs;
539
540 zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
541 if (zs)
542 dmu_zfetch_run(zs, missed, have_lock);
543 }
544
545 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
546 "Disable all ZFS prefetching");
547
548 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
549 "Max number of streams per zfetch");
550
551 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
552 "Min time before stream reclaim");
553
554 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
555 "Max time before stream delete");
556
557 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
558 "Min bytes to prefetch per stream");
559
560 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
561 "Max bytes to prefetch per stream");
562
563 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
564 "Max bytes to prefetch indirects for per stream");
565
566 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW,
567 "Number of bytes in a array_read");
Cache object: 796e1dcc06ee0a2da907106e5d84c931
|