The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vfs/hammer/hammer_redo.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
    3  *
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  *
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 /*
   36  * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
   37  *
   38  * See also hammer_undo.c
   39  */
   40 
   41 #include "hammer.h"
   42 
/*
 * Instantiate the red-black tree support code for hammer_redo_rb_tree.
 * This file uses the tree through RB_INSERT(), RB_REMOVE() and RB_FIRST()
 * on hmp->rb_redo_root to track inodes that have REDO records outstanding.
 *
 * NOTE(review): the arguments appear to be (tree name, element struct,
 * link field, compare function, key type, key field), i.e. inodes are
 * presumably ordered by redo_fifo_start -- confirm against the RB macro
 * definitions.
 */
RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
             hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
   45 
   46 /*
   47  * HAMMER version 4+ REDO support.
   48  *
   49  * REDO records are used to improve fsync() performance.  Instead of having
   50  * to go through a complete double-flush cycle involving at least two disk
   51  * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
   52  * the related REDO records, which is a single synchronization requiring
   53  * no track seeking.  If a recovery becomes necessary the recovery code
   54  * will generate logical data writes based on the REDO records encountered.
   55  * That is, the recovery code will UNDO any partial meta-data/data writes
   56  * at the raw disk block level and then REDO the data writes at the logical
   57  * level.
   58  */
   59 int
   60 hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
   61                      hammer_off_t file_off, u_int32_t flags,
   62                      void *base, int len)
   63 {
   64         hammer_mount_t hmp;
   65         hammer_volume_t root_volume;
   66         hammer_blockmap_t undomap;
   67         hammer_buffer_t buffer = NULL;
   68         hammer_fifo_redo_t redo;
   69         hammer_fifo_tail_t tail;
   70         hammer_off_t next_offset;
   71         int error;
   72         int bytes;
   73         int n;
   74 
   75         /*
   76          * Setup
   77          */
   78         hmp = trans->hmp;
   79 
   80         root_volume = trans->rootvol;
   81         undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
   82 
   83         /*
   84          * No undo recursion when modifying the root volume
   85          */
   86         hammer_modify_volume(NULL, root_volume, NULL, 0);
   87         hammer_lock_ex(&hmp->undo_lock);
   88 
   89         /* undo had better not roll over (loose test) */
   90         if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
   91                 panic("hammer: insufficient undo FIFO space!");
   92 
   93         /*
   94          * Loop until the undo for the entire range has been laid down.
   95          * Loop at least once (len might be 0 as a degenerate case).
   96          */
   97         for (;;) {
   98                 /*
   99                  * Fetch the layout offset in the UNDO FIFO, wrap it as
  100                  * necessary.
  101                  */
  102                 if (undomap->next_offset == undomap->alloc_offset) {
  103                         undomap->next_offset =
  104                                 HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
  105                 }
  106                 next_offset = undomap->next_offset;
  107 
  108                 /*
  109                  * This is a tail-chasing FIFO, when we hit the start of a new
  110                  * buffer we don't have to read it in.
  111                  */
  112                 if ((next_offset & HAMMER_BUFMASK) == 0) {
  113                         redo = hammer_bnew(hmp, next_offset, &error, &buffer);
  114                         hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
  115                 } else {
  116                         redo = hammer_bread(hmp, next_offset, &error, &buffer);
  117                 }
  118                 if (error)
  119                         break;
  120                 hammer_modify_buffer(NULL, buffer, NULL, 0);
  121 
  122                 /*
  123                  * Calculate how big a media structure fits up to the next
  124                  * alignment point and how large a data payload we can
  125                  * accomodate.
  126                  *
  127                  * If n calculates to 0 or negative there is no room for
  128                  * anything but a PAD.
  129                  */
  130                 bytes = HAMMER_UNDO_ALIGN -
  131                         ((int)next_offset & HAMMER_UNDO_MASK);
  132                 n = bytes -
  133                     (int)sizeof(struct hammer_fifo_redo) -
  134                     (int)sizeof(struct hammer_fifo_tail);
  135 
  136                 /*
  137                  * If available space is insufficient for any payload
  138                  * we have to lay down a PAD.
  139                  *
  140                  * The minimum PAD is 8 bytes and the head and tail will
  141                  * overlap each other in that case.  PADs do not have
  142                  * sequence numbers or CRCs.
  143                  *
  144                  * A PAD may not start on a boundary.  That is, every
  145                  * 512-byte block in the UNDO/REDO FIFO must begin with
  146                  * a record containing a sequence number.
  147                  */
  148                 if (n <= 0) {
  149                         KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
  150                         KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
  151                         tail = (void *)((char *)redo + bytes - sizeof(*tail));
  152                         if ((void *)redo != (void *)tail) {
  153                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
  154                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
  155                                 tail->tail_size = bytes;
  156                         }
  157                         redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
  158                         redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
  159                         redo->head.hdr_size = bytes;
  160                         /* NO CRC OR SEQ NO */
  161                         undomap->next_offset += bytes;
  162                         hammer_modify_buffer_done(buffer);
  163                         hammer_stats_redo += bytes;
  164                         continue;
  165                 }
  166 
  167                 /*
  168                  * When generating an inode-related REDO record we track
  169                  * the point in the UNDO/REDO FIFO containing the inode's
  170                  * earliest REDO record.  See hammer_generate_redo_sync().
  171                  *
  172                  * redo_fifo_next is cleared when an inode is staged to
  173                  * the backend and then used to determine how to reassign
  174                  * redo_fifo_start after the inode flush completes.
  175                  */
  176                 if (ip) {
  177                         redo->redo_objid = ip->obj_id;
  178                         redo->redo_localization = ip->obj_localization;
  179                         if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
  180                                 ip->redo_fifo_start = next_offset;
  181                                 if (RB_INSERT(hammer_redo_rb_tree,
  182                                               &hmp->rb_redo_root, ip)) {
  183                                         panic("hammer_generate_redo: "
  184                                               "cannot insert inode %p on "
  185                                               "redo FIFO", ip);
  186                                 }
  187                                 ip->flags |= HAMMER_INODE_RDIRTY;
  188                         }
  189                         if (ip->redo_fifo_next == 0)
  190                                 ip->redo_fifo_next = next_offset;
  191                 } else {
  192                         redo->redo_objid = 0;
  193                         redo->redo_localization = 0;
  194                 }
  195 
  196                 /*
  197                  * Calculate the actual payload and recalculate the size
  198                  * of the media structure as necessary.  If no data buffer
  199                  * is supplied there is no payload.
  200                  */
  201                 if (base == NULL) {
  202                         n = 0;
  203                 } else if (n > len) {
  204                         n = len;
  205                 }
  206                 bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
  207                          ~HAMMER_HEAD_ALIGN_MASK) +
  208                         (int)sizeof(struct hammer_fifo_redo) +
  209                         (int)sizeof(struct hammer_fifo_tail);
  210                 if (hammer_debug_general & 0x0080) {
  211                         kprintf("redo %016llx %d %d\n",
  212                                 (long long)next_offset, bytes, n);
  213                 }
  214 
  215                 redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
  216                 redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
  217                 redo->head.hdr_size = bytes;
  218                 redo->head.hdr_seq = hmp->undo_seqno++;
  219                 redo->head.hdr_crc = 0;
  220                 redo->redo_mtime = trans->time;
  221                 redo->redo_offset = file_off;
  222                 redo->redo_flags = flags;
  223 
  224                 /*
  225                  * Incremental payload.  If no payload we throw the entire
  226                  * len into redo_data_bytes and will not loop.
  227                  */
  228                 if (base) {
  229                         redo->redo_data_bytes = n;
  230                         bcopy(base, redo + 1, n);
  231                         len -= n;
  232                         base = (char *)base + n;
  233                         file_off += n;
  234                 } else {
  235                         redo->redo_data_bytes = len;
  236                         file_off += len;
  237                         len = 0;
  238                 }
  239 
  240                 tail = (void *)((char *)redo + bytes - sizeof(*tail));
  241                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
  242                 tail->tail_type = HAMMER_HEAD_TYPE_REDO;
  243                 tail->tail_size = bytes;
  244 
  245                 KKASSERT(bytes >= sizeof(redo->head));
  246                 redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
  247                              crc32(&redo->head + 1, bytes - sizeof(redo->head));
  248                 undomap->next_offset += bytes;
  249                 hammer_stats_redo += bytes;
  250 
  251                 /*
  252                  * Before we finish off the buffer we have to deal with any
  253                  * junk between the end of the media structure we just laid
  254                  * down and the UNDO alignment boundary.  We do this by laying
  255                  * down a dummy PAD.  Even though we will probably overwrite
  256                  * it almost immediately we have to do this so recovery runs
  257                  * can iterate the UNDO space without having to depend on
  258                  * the indices in the volume header.
  259                  *
  260                  * This dummy PAD will be overwritten on the next undo so
  261                  * we do not adjust undomap->next_offset.
  262                  */
  263                 bytes = HAMMER_UNDO_ALIGN -
  264                         ((int)undomap->next_offset & HAMMER_UNDO_MASK);
  265                 if (bytes != HAMMER_UNDO_ALIGN) {
  266                         KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
  267                         redo = (void *)(tail + 1);
  268                         tail = (void *)((char *)redo + bytes - sizeof(*tail));
  269                         if ((void *)redo != (void *)tail) {
  270                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
  271                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
  272                                 tail->tail_size = bytes;
  273                         }
  274                         redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
  275                         redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
  276                         redo->head.hdr_size = bytes;
  277                         /* NO CRC OR SEQ NO */
  278                 }
  279                 hammer_modify_buffer_done(buffer);
  280                 if (len == 0)
  281                         break;
  282         }
  283         hammer_modify_volume_done(root_volume);
  284         hammer_unlock(&hmp->undo_lock);
  285 
  286         if (buffer)
  287                 hammer_rel_buffer(buffer, 0);
  288 
  289         /*
  290          * Make sure the nominal undo span contains at least one REDO_SYNC,
  291          * otherwise the REDO recovery will not be triggered.
  292          */
  293         if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
  294             flags != HAMMER_REDO_SYNC) {
  295                 hammer_generate_redo_sync(trans);
  296         }
  297 
  298         return(error);
  299 }
  300 
  301 /*
  302  * Generate a REDO SYNC record.  At least one such record must be generated
  303  * in the nominal recovery span for the recovery code to be able to run
  304  * REDOs outside of the span.
  305  *
  306  * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
  307  * for all inodes with active REDOs.  This changes dynamically as inodes
  308  * get flushed.
  309  *
  310  * During recovery stage2 any new flush cycles must specify the original
  311  * redo sync offset.  That way a crash will re-run the REDOs, at least
  312  * up to the point where the UNDO FIFO does not overwrite the area.
  313  */
  314 void
  315 hammer_generate_redo_sync(hammer_transaction_t trans)
  316 {
  317         hammer_mount_t hmp = trans->hmp;
  318         hammer_inode_t ip;
  319         hammer_off_t redo_fifo_start;
  320 
  321         if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
  322                 ip = NULL;
  323                 redo_fifo_start = hmp->recover_stage2_offset;
  324         } else {
  325                 ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
  326                 if (ip)
  327                         redo_fifo_start = ip->redo_fifo_start;
  328                 else
  329                         redo_fifo_start = 0;
  330         }
  331         if (redo_fifo_start) {
  332                 if (hammer_debug_io & 0x0004) {
  333                         kprintf("SYNC IP %p %016jx\n",
  334                                 ip, (intmax_t)redo_fifo_start);
  335                 }
  336                 hammer_generate_redo(trans, NULL, redo_fifo_start,
  337                                      HAMMER_REDO_SYNC, NULL, 0);
  338                 trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
  339         }
  340 }
  341 
  342 /*
  343  * This is called when an inode is queued to the backend.
  344  */
  345 void
  346 hammer_redo_fifo_start_flush(hammer_inode_t ip)
  347 {
  348         ip->redo_fifo_next = 0;
  349 }
  350 
  351 /*
  352  * This is called when an inode backend flush is finished.  We have to make
  353  * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
  354  * can get destroyed through operations such as truncations and leave
  355  * us with a stale redo_fifo_next.
  356  */
  357 void
  358 hammer_redo_fifo_end_flush(hammer_inode_t ip)
  359 {
  360         hammer_mount_t hmp = ip->hmp;
  361 
  362         if (ip->flags & HAMMER_INODE_RDIRTY) {
  363                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
  364                 ip->flags &= ~HAMMER_INODE_RDIRTY;
  365         }
  366         if ((ip->flags & HAMMER_INODE_BUFS) == 0)
  367                 ip->redo_fifo_next = 0;
  368         if (ip->redo_fifo_next) {
  369                 ip->redo_fifo_start = ip->redo_fifo_next;
  370                 if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
  371                         panic("hammer_generate_redo: cannot reinsert "
  372                               "inode %p on redo FIFO",
  373                               ip);
  374                 }
  375                 ip->flags |= HAMMER_INODE_RDIRTY;
  376         }
  377 }

Cache object: f98f10c010e62220b18079eec71f5da5


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.