The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vfs/hammer/hammer_dedup.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
    3  *
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Ilya Dryomov <idryomov@gmail.com>
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  *
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 #include "hammer.h"
   36 
   37 static __inline int validate_zone(hammer_off_t data_offset);
   38 
/*
 * Batched dedup ioctl backend: given two B-Tree element keys (elm1/elm2)
 * describing records whose data blocks are believed identical, verify the
 * match and rewire elm2's leaf to reference elm1's data block, freeing
 * elm2's original block.
 *
 * Returns 0 on success (possibly with a status flag set in
 * dedup->head.flags: INVALID_ZONE, CMP_FAILURE or UNDERFLOW) or an errno.
 * A non-zero return sends the candidate to the caller's pass2 list.
 */
int
hammer_ioc_dedup(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_dedup *dedup)
{
        struct hammer_cursor cursor1, cursor2;
        int error;
        int seq;

        /*
         * Enforce hammer filesystem version requirements
         */
        if (trans->hmp->version < HAMMER_VOL_VERSION_FIVE) {
                kprintf("hammer: Filesystem must be upgraded to v5 "
                        "before you can run dedup\n");
                return (EOPNOTSUPP);
        }

        /*
         * Cursor1, return an error -> candidate goes to pass2 list
         */
        error = hammer_init_cursor(trans, &cursor1, NULL, NULL);
        if (error)
                goto done_cursor;
        cursor1.key_beg = dedup->elm1;
        cursor1.flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(&cursor1);
        if (error)
                goto done_cursor;
        /* Need both the leaf element and the record data for comparison */
        error = hammer_btree_extract(&cursor1, HAMMER_CURSOR_GET_LEAF |
                                                HAMMER_CURSOR_GET_DATA);
        if (error)
                goto done_cursor;

        /*
         * Cursor2, return an error -> candidate goes to pass2 list
         */
        error = hammer_init_cursor(trans, &cursor2, NULL, NULL);
        if (error)
                goto done_cursors;
        cursor2.key_beg = dedup->elm2;
        cursor2.flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(&cursor2);
        if (error)
                goto done_cursors;
        error = hammer_btree_extract(&cursor2, HAMMER_CURSOR_GET_LEAF |
                                                HAMMER_CURSOR_GET_DATA);
        if (error)
                goto done_cursors;

        /*
         * Zone validation. We can't de-dup any of the other zones
         * (BTREE or META) or bad things will happen.
         *
         * Return with error = 0, but set an INVALID_ZONE flag.
         */
        error = validate_zone(cursor1.leaf->data_offset) +
                            validate_zone(cursor2.leaf->data_offset);
        if (error) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_INVALID_ZONE;
                error = 0;
                goto done_cursors;
        }

        /*
         * Comparison checks
         *
         * If zones don't match or data_len fields aren't the same
         * we consider it to be a comparison failure.
         *
         * Return with error = 0, but set a CMP_FAILURE flag.
         */
        if ((cursor1.leaf->data_offset & HAMMER_OFF_ZONE_MASK) !=
            (cursor2.leaf->data_offset & HAMMER_OFF_ZONE_MASK)) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }
        if (cursor1.leaf->data_len != cursor2.leaf->data_len) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }

        /* byte-by-byte comparison to be sure */
        if (bcmp(cursor1.data, cursor2.data, cursor1.leaf->data_len)) {
                dedup->head.flags |= HAMMER_IOC_DEDUP_CMP_FAILURE;
                goto done_cursors;
        }

        /*
         * Upgrade both cursors together to an exclusive lock
         *
         * Return an error -> candidate goes to pass2 list
         */
        hammer_sync_lock_sh(trans);
        error = hammer_cursor_upgrade2(&cursor1, &cursor2);
        if (error) {
                hammer_sync_unlock(trans);
                goto done_cursors;
        }

        /*
         * Account the extra reference on cursor1's data block.
         * NOTE(review): ERANGE from hammer_blockmap_dedup() appears to
         * signal a reference-count underflow condition — confirm against
         * hammer_blockmap.c before relying on this.
         */
        error = hammer_blockmap_dedup(cursor1.trans,
                        cursor1.leaf->data_offset, cursor1.leaf->data_len);
        if (error) {
                if (error == ERANGE) {
                        /*
                         * Return with error = 0, but set an UNDERFLOW flag
                         */
                        dedup->head.flags |= HAMMER_IOC_DEDUP_UNDERFLOW;
                        error = 0;
                        goto downgrade_cursors;
                } else {
                        /*
                         * Return an error -> block goes to pass2 list
                         */
                        goto downgrade_cursors;
                }
        }

        /*
         * The cursor2's cache must be invalidated before calling
         * hammer_blockmap_free(), otherwise it will not be able to
         * invalidate the underlying data buffer.
         */
        hammer_cursor_invalidate_cache(&cursor2);
        hammer_blockmap_free(cursor2.trans,
                        cursor2.leaf->data_offset, cursor2.leaf->data_len);

        /*
         * Rewire cursor2's leaf to point at cursor1's (now shared) data
         * block, under the node modification protocol.
         */
        hammer_modify_node(cursor2.trans, cursor2.node,
                        &cursor2.leaf->data_offset, sizeof(hammer_off_t));
        cursor2.leaf->data_offset = cursor1.leaf->data_offset;
        hammer_modify_node_done(cursor2.node);

downgrade_cursors:
        hammer_cursor_downgrade2(&cursor1, &cursor2);
        hammer_sync_unlock(trans);
done_cursors:
        hammer_done_cursor(&cursor2);
done_cursor:
        hammer_done_cursor(&cursor1);

        /*
         * Avoid deadlocking the buffer cache
         */
        seq = trans->hmp->flusher.done;
        while (hammer_flusher_meta_halflimit(trans->hmp) ||
               hammer_flusher_undo_exhausted(trans, 2)) {
                hammer_flusher_wait(trans->hmp, seq);
                seq = hammer_flusher_async_one(trans->hmp);
        }
        return (error);
}
  191 
  192 static __inline int
  193 validate_zone(hammer_off_t data_offset)
  194 {
  195         switch(data_offset & HAMMER_OFF_ZONE_MASK) {
  196         case HAMMER_ZONE_LARGE_DATA:
  197         case HAMMER_ZONE_SMALL_DATA:
  198                 return (0);
  199         default:
  200                 return (1);
  201         }
  202 }
  203 
  204 /************************************************************************
  205  *                              LIVE DEDUP                              *
  206  ************************************************************************
  207  *
   208  * HAMMER Live Dedup (aka an efficient cp(1) implementation)
  209  *
   210  * The dedup cache is operated in an LRU fashion and destroyed on
  211  * unmount, so essentially this is a live dedup on a cached dataset and
  212  * not a full-fledged fs-wide one - we have a batched dedup for that.
  213  * We allow duplicate entries in the buffer cache, data blocks are
  214  * deduplicated only on their way to media.  By default the cache is
  215  * populated on reads only, but it can be populated on writes too.
  216  *
  217  * The main implementation gotcha is on-media requirement - in order for
  218  * a data block to be added to a dedup cache it has to be present on
  219  * disk.  This simplifies cache logic a lot - once data is laid out on
  220  * media it remains valid on media all the way up to the point where the
  221  * related big block the data was stored in is freed - so there is only
  222  * one place we need to bother with invalidation code.
  223  */
  224 
/*
 * RB-Tree support for dedup cache structures
 *
 * Cache entries are indexed twice: by data CRC (for candidate lookup)
 * and by media data offset (for invalidation scans).
 */
RB_GENERATE2(hammer_dedup_crc_rb_tree, hammer_dedup_cache, crc_entry,
                hammer_dedup_crc_rb_compare, hammer_crc_t, crc);
RB_GENERATE2(hammer_dedup_off_rb_tree, hammer_dedup_cache, off_entry,
                hammer_dedup_off_rb_compare, hammer_off_t, data_offset);

/*
 * Argument bundle threaded through RB_SCAN() to the invalidation
 * callback: the mount plus the base offset of the big block whose
 * cached entries must be dropped.
 */
struct hammer_dedup_inval {
        struct hammer_mount *hmp;
        hammer_off_t base_offset;
};

static int hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data);
static int hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc,
                void *data);
static __inline int _vnode_validate(hammer_dedup_cache_t dcp, void *data,
                int *errorp);
static __inline int _dev_validate(hammer_dedup_cache_t dcp, void *data,
                int *errorp);
  245 
  246 int
  247 hammer_dedup_crc_rb_compare(hammer_dedup_cache_t dc1,
  248                                 hammer_dedup_cache_t dc2)
  249 {
  250         if (dc1->crc < dc2->crc)
  251                 return (-1);
  252         if (dc1->crc > dc2->crc)
  253                 return (1);
  254 
  255         return (0);
  256 }
  257 
  258 int
  259 hammer_dedup_off_rb_compare(hammer_dedup_cache_t dc1,
  260                                 hammer_dedup_cache_t dc2)
  261 {
  262         if (dc1->data_offset < dc2->data_offset)
  263                 return (-1);
  264         if (dc1->data_offset > dc2->data_offset)
  265                 return (1);
  266 
  267         return (0);
  268 }
  269 
  270 static int
  271 hammer_dedup_scancmp(hammer_dedup_cache_t dc, void *data)
  272 {
  273         hammer_off_t off = ((struct hammer_dedup_inval *)data)->base_offset;
  274 
  275         if (dc->data_offset < off)
  276                 return (-1);
  277         if (dc->data_offset >= off + HAMMER_LARGEBLOCK_SIZE)
  278                 return (1);
  279 
  280         return (0);
  281 }
  282 
  283 static int
  284 hammer_dedup_cache_inval_callback(hammer_dedup_cache_t dc, void *data)
  285 {
  286         hammer_mount_t hmp = ((struct hammer_dedup_inval *)data)->hmp;
  287 
  288         RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dc);
  289         RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dc);
  290         TAILQ_REMOVE(&hmp->dedup_lru_list, dc, lru_entry);
  291 
  292         --hmp->dedup_cache_count;
  293         kfree(dc, hmp->m_misc);
  294 
  295         return (0);
  296 }
  297 
/*
 * Insert (or refresh) a dedup cache entry describing the on-media data
 * block referenced by @leaf, associated with inode @ip.  An existing
 * entry with the same data CRC is reused and re-keyed; otherwise a new
 * entry is taken from the pre-stocked free slot or, when the cache is
 * full, recycled from the head of the LRU.  Returns the populated entry.
 */
hammer_dedup_cache_t
hammer_dedup_cache_add(hammer_inode_t ip, hammer_btree_leaf_elm_t leaf)
{
        hammer_dedup_cache_t dcp, tmp;
        hammer_mount_t hmp = ip->hmp;

        /*
         * Pre-stock the single free slot so the code below never needs
         * to allocate.  kmalloc(M_WAITOK) can block, so the slot is
         * re-checked after the call and the allocation is discarded if
         * someone else filled it in the meantime.
         */
        if (hmp->dedup_free_cache == NULL) {
                tmp = kmalloc(sizeof(*tmp), hmp->m_misc, M_WAITOK | M_ZERO);
                if (hmp->dedup_free_cache == NULL)
                        hmp->dedup_free_cache = tmp;
                else
                        kfree(tmp, hmp->m_misc);
        }

        KKASSERT(leaf != NULL);

        /*
         * CRC hit: keep the entry in the CRC tree but pull it out of the
         * offset tree and the LRU; "populate" re-inserts it with the new
         * offset key and moves it to the LRU tail.
         */
        dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
                                &hmp->rb_dedup_crc_root, leaf->data_crc);
        if (dcp != NULL) {
                RB_REMOVE(hammer_dedup_off_rb_tree,
                                &hmp->rb_dedup_off_root, dcp);
                TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
                goto populate;
        }

        /*
         * No hit: either consume the free slot (cache not yet at its
         * size limit) or evict the least-recently-used entry and reuse
         * its memory.
         */
        if (hmp->dedup_cache_count < hammer_live_dedup_cache_size) {
                dcp = hmp->dedup_free_cache;
                hmp->dedup_free_cache = NULL;
                ++hmp->dedup_cache_count;
        } else {
                dcp = TAILQ_FIRST(&hmp->dedup_lru_list);
                RB_REMOVE(hammer_dedup_crc_rb_tree,
                                &hmp->rb_dedup_crc_root, dcp);
                RB_REMOVE(hammer_dedup_off_rb_tree,
                                &hmp->rb_dedup_off_root, dcp);
                TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
        }

        dcp->crc = leaf->data_crc;
        tmp = RB_INSERT(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
        KKASSERT(tmp == NULL);

populate:
        /* (Re)key the entry from the inode and leaf element */
        dcp->hmp = ip->hmp;
        dcp->obj_id = ip->obj_id;
        dcp->localization = ip->obj_localization;
        dcp->file_offset = leaf->base.key - leaf->data_len;
        dcp->bytes = leaf->data_len;
        dcp->data_offset = leaf->data_offset;

        tmp = RB_INSERT(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
        KKASSERT(tmp == NULL);
        TAILQ_INSERT_TAIL(&hmp->dedup_lru_list, dcp, lru_entry);

        return (dcp);
}
  354 
  355 __inline hammer_dedup_cache_t
  356 hammer_dedup_cache_lookup(hammer_mount_t hmp, hammer_crc_t crc)
  357 {
  358         hammer_dedup_cache_t dcp;
  359 
  360         dcp = RB_LOOKUP(hammer_dedup_crc_rb_tree,
  361                                 &hmp->rb_dedup_crc_root, crc);
  362         return dcp;
  363 }
  364 
  365 void hammer_dedup_cache_inval(hammer_mount_t hmp, hammer_off_t base_offset)
  366 {
  367         struct hammer_dedup_inval di;
  368 
  369         di.hmp = hmp;
  370         di.base_offset = base_offset;
  371 
  372         RB_SCAN(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root,
  373                 hammer_dedup_scancmp, hammer_dedup_cache_inval_callback, &di);
  374 }
  375 
  376 void
  377 hammer_destroy_dedup_cache(hammer_mount_t hmp)
  378 {
  379         hammer_dedup_cache_t dcp;
  380 
  381         while ((dcp = TAILQ_FIRST(&hmp->dedup_lru_list)) != NULL) {
  382                 RB_REMOVE(hammer_dedup_crc_rb_tree, &hmp->rb_dedup_crc_root, dcp);
  383                 RB_REMOVE(hammer_dedup_off_rb_tree, &hmp->rb_dedup_off_root, dcp);
  384                 TAILQ_REMOVE(&hmp->dedup_lru_list, dcp, lru_entry);
  385                 --hmp->dedup_cache_count;
  386                 kfree(dcp, hmp->m_misc);
  387         }
  388 
  389         KKASSERT(RB_EMPTY(&hmp->rb_dedup_crc_root));
  390         KKASSERT(RB_EMPTY(&hmp->rb_dedup_off_root));
  391         KKASSERT(TAILQ_EMPTY(&hmp->dedup_lru_list));
  392 
  393         KKASSERT(hmp->dedup_cache_count == 0);
  394 }
  395 
/*
 * Decide whether a candidate data block (@data, @bytes, destined for
 * @zone) may be de-duplicated against the cached entry @dcp.  The zone
 * and length must match and the bytes must compare equal against the
 * on-media copy.  Returns 1 when dedup is safe, 0 otherwise.
 */
int
hammer_dedup_validate(hammer_dedup_cache_t dcp, int zone, int bytes,
                      void *data)
{
        int error;

        /*
         * Zone validation
         */
        if (HAMMER_ZONE_DECODE(dcp->data_offset) != zone)
                return (0);

        /*
         * Block length validation
         */
        if (dcp->bytes != bytes)
                return (0);

        /*
         * Byte-by-byte data comparison
         *
         * The data we need for validation may already be present in the
         * buffer cache in two flavours: vnode-based buffer or
         * block-device-based buffer.  In case vnode-based buffer wasn't
         * there or if a non-blocking attempt to acquire it failed use
         * device-based buffer (for large-zone data blocks it will
         * generate a separate read).
         *
         * XXX vnode-based checking is not MP safe so when live-dedup
         *     is turned on we must always use the device buffer.
         */
#if 0
        if (hammer_double_buffer) {
                error = 1;
        } else if (_vnode_validate(dcp, data, &error)) {
                hammer_live_dedup_vnode_bcmps++;
                return (1);
        } else {
                if (error == 3)
                        hammer_live_dedup_findblk_failures++;
        }

        /*
         * If there was an error previously or if double buffering is
         * enabled.
         */
        if (error) {
                if (_dev_validate(dcp, data, &error)) {
                        hammer_live_dedup_device_bcmps++;
                        return (1);
                }
        }
#endif
        /*
         * Only the device-buffer comparison is live; the vnode-based
         * fast path above is disabled (see the XXX note) but kept for
         * reference.
         */
        if (_dev_validate(dcp, data, &error)) {
                hammer_live_dedup_device_bcmps++;
                return (1);
        }

        return (0);
}
  456 
/*
 * Validate a dedup candidate against the vnode-based buffer cache copy
 * of the cached block.  Returns 1 when a clean cached buffer was found
 * whose bytes match @data, 0 otherwise.  *errorp classifies failures:
 *   1 - inode lookup failed          5 - buffer not cached or dirty
 *   2 - could not acquire vnode     6 - VOP_BMAP failed
 *   3 - findblk (non-blocking) miss 7 - bmap offset mismatch
 */
static __inline int
_vnode_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        struct vnode *vp;
        struct buf *bp;
        off_t dooffset;
        int result, error;

        result = error = 0;
        *errorp = 0;

        hammer_simple_transaction(&trans, dcp->hmp);

        /* Resolve the cached (obj_id, localization) back to an inode */
        ip = hammer_get_inode(&trans, NULL, dcp->obj_id, HAMMER_MAX_TID,
            dcp->localization, 0, &error);
        if (ip == NULL) {
                kprintf("dedup: unable to find objid %016jx:%08x\n",
                    (intmax_t)dcp->obj_id, dcp->localization);
                *errorp = 1;
                goto failed2;
        }

        error = hammer_get_vnode(ip, &vp);
        if (error) {
                kprintf("dedup: unable to acquire vnode for %016jx:%08x\n",
                    (intmax_t)dcp->obj_id, dcp->localization);
                *errorp = 2;
                goto failed;
        }

        /* Non-blocking probe of the buffer cache at the file offset */
        if ((bp = findblk(ip->vp, dcp->file_offset, FINDBLK_NBLOCK)) != NULL) {
                bremfree(bp);

                /* XXX if (mapped to userspace) goto done, *errorp = 4 */

                /* Only a clean, fully-cached buffer is trustworthy */
                if ((bp->b_flags & B_CACHE) == 0 || bp->b_flags & B_DIRTY) {
                        *errorp = 5;
                        goto done;
                }

                /*
                 * The buffer must be backed by the same media offset the
                 * cache entry recorded; fall back to VOP_BMAP() when the
                 * bio translation isn't already established.
                 */
                if (bp->b_bio2.bio_offset != dcp->data_offset) {
                        error = VOP_BMAP(ip->vp, dcp->file_offset, &dooffset,
                            NULL, NULL, BUF_CMD_READ);
                        if (error) {
                                *errorp = 6;
                                goto done;
                        }

                        if (dooffset != dcp->data_offset) {
                                *errorp = 7;
                                goto done;
                        }
                        hammer_live_dedup_bmap_saves++;
                }

                if (bcmp(data, bp->b_data, dcp->bytes) == 0)
                        result = 1;

done:
                bqrelse(bp);
        } else {
                *errorp = 3;
        }
        vput(vp);

failed:
        hammer_rel_inode(ip, 0);
failed2:
        hammer_done_transaction(&trans);
        return (result);
}
  530 
  531 static __inline int
  532 _dev_validate(hammer_dedup_cache_t dcp, void *data, int *errorp)
  533 {
  534         hammer_buffer_t buffer = NULL;
  535         void *ondisk_data;
  536         int result, error;
  537 
  538         result = error = 0;
  539         *errorp = 0;
  540 
  541         ondisk_data = hammer_bread_ext(dcp->hmp, dcp->data_offset,
  542             dcp->bytes, &error, &buffer);
  543         if (error) {
  544                 *errorp = 1;
  545                 goto failed;
  546         }
  547 
  548         if (bcmp(data, ondisk_data, dcp->bytes) == 0)
  549                 result = 1;
  550 
  551 failed:
  552         if (buffer)
  553                 hammer_rel_buffer(buffer, 0);
  554 
  555         return (result);
  556 }

Cache object: b6369616524e6c75d992e2251cd225da


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.