The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/geom/journal/g_journal.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD$");
   29 
   30 #include <sys/param.h>
   31 #include <sys/systm.h>
   32 #include <sys/kernel.h>
   33 #include <sys/module.h>
   34 #include <sys/limits.h>
   35 #include <sys/lock.h>
   36 #include <sys/mutex.h>
   37 #include <sys/bio.h>
   38 #include <sys/sysctl.h>
   39 #include <sys/malloc.h>
   40 #include <sys/mount.h>
   41 #include <sys/eventhandler.h>
   42 #include <sys/proc.h>
   43 #include <sys/kthread.h>
   44 #include <sys/sched.h>
   45 #include <sys/taskqueue.h>
   46 #include <sys/vnode.h>
   47 #include <sys/sbuf.h>
   48 #ifdef GJ_MEMDEBUG
   49 #include <sys/stack.h>
   50 #include <sys/kdb.h>
   51 #endif
   52 #include <vm/vm.h>
   53 #include <vm/vm_kern.h>
   54 #include <geom/geom.h>
   55 
   56 #include <geom/journal/g_journal.h>
   57 
   58 
   59 /*
   60  * On-disk journal format:
   61  *
   62  * JH - Journal header
   63  * RH - Record header
   64  *
   65  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
   66  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
   67  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
   68  *
   69  */
   70 
/* Both on-disk header structures are required to fit in 512 bytes. */
CTASSERT(sizeof(struct g_journal_header) <= 512);
CTASSERT(sizeof(struct g_journal_record_header) <= 512);

/* malloc(9) type used for every allocation made through gj_malloc(). */
static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
/*
 * Mutex taken around updates of g_journal_cache_used and around waking
 * up the switcher via g_journal_switcher_wokenup.
 */
static struct mtx g_journal_cache_mtx;
MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);

/* NULL-terminated table of file system descriptors known to gjournal. */
const struct g_journal_desc *g_journal_filesystems[] = {
        &g_journal_ufs,
        NULL
};
   82 
SYSCTL_DECL(_kern_geom);

/*
 * Knobs exported under kern.geom.journal.  The debug level can also be
 * set as a loader tunable; the remaining values are runtime sysctls.
 */
int g_journal_debug = 0;
TUNABLE_INT("kern.geom.journal.debug", &g_journal_debug);
static u_int g_journal_switch_time = 10;
static u_int g_journal_force_switch = 70;
static u_int g_journal_parallel_flushes = 16;
static u_int g_journal_parallel_copies = 16;
static u_int g_journal_accept_immediately = 64;
static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
static u_int g_journal_do_optimize = 1;

SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff");
SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0,
    "Debug level");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
    &g_journal_switch_time, 0, "Switch journals every N seconds");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
    &g_journal_force_switch, 0, "Force switch when journal is N% full");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
    &g_journal_parallel_flushes, 0,
    "Number of flush I/O requests to send in parallel");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
    &g_journal_accept_immediately, 0,
    "Number of I/O requests accepted immediately");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
    &g_journal_parallel_copies, 0,
    "Number of copy I/O requests to send in parallel");
  111 static int
  112 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
  113 {
  114         u_int entries;
  115         int error;
  116 
  117         entries = g_journal_record_entries;
  118         error = sysctl_handle_int(oidp, &entries, 0, req);
  119         if (error != 0 || req->newptr == NULL)
  120                 return (error);
  121         if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
  122                 return (EINVAL);
  123         g_journal_record_entries = entries;
  124         return (0);
  125 }
  126 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
  127     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
  128     "Maximum number of entires in one journal record");
  129 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
  130     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
  131 
/*
 * Cache accounting, exported under kern.geom.journal.cache.
 * g_journal_cache_used is adjusted by gj_malloc()/gj_free()/gj_realloc();
 * g_journal_cache_low is derived from the limit and the switch percentage
 * by the sysctl handlers.
 */
static u_int g_journal_cache_used = 0;
static u_int g_journal_cache_limit = 64 * 1024 * 1024;
TUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit);
static u_int g_journal_cache_divisor = 2;
TUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor);
static u_int g_journal_cache_switch = 90;
static u_int g_journal_cache_misses = 0;
static u_int g_journal_cache_alloc_failures = 0;
static u_int g_journal_cache_low = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
    "GEOM_JOURNAL cache");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
    &g_journal_cache_used, 0, "Number of allocated bytes");
  146 static int
  147 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
  148 {
  149         u_int limit;
  150         int error;
  151 
  152         limit = g_journal_cache_limit;
  153         error = sysctl_handle_int(oidp, &limit, 0, req);
  154         if (error != 0 || req->newptr == NULL)
  155                 return (error);
  156         g_journal_cache_limit = limit;
  157         g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
  158         return (0);
  159 }
/*
 * kern.geom.journal.cache.limit goes through g_journal_cache_limit_sysctl();
 * kern.geom.journal.cache.divisor is declared CTLFLAG_RDTUN.
 */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I",
    "Maximum number of allocated bytes");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
    &g_journal_cache_divisor, 0,
    "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
  166 static int
  167 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
  168 {
  169         u_int cswitch;
  170         int error;
  171 
  172         cswitch = g_journal_cache_switch;
  173         error = sysctl_handle_int(oidp, &cswitch, 0, req);
  174         if (error != 0 || req->newptr == NULL)
  175                 return (error);
  176         if (cswitch < 0 || cswitch > 100)
  177                 return (EINVAL);
  178         g_journal_cache_switch = cswitch;
  179         g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
  180         return (0);
  181 }
/* kern.geom.journal.cache.switch goes through g_journal_cache_switch_sysctl(). */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
    "Force switch when we hit this percent of cache use");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
    &g_journal_cache_misses, 0, "Number of cache misses");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
    &g_journal_cache_alloc_failures, 0, "Memory allocation failures");

/* Counters exported (read-write) under kern.geom.journal.stats. */
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
    "GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
    &g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
    &g_journal_stats_journal_full, 0,
    "Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
  212 
/* GEOM class methods implemented by this file. */
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;

/* Class descriptor wiring the methods above into GEOM. */
struct g_class g_journal_class = {
        .name = G_JOURNAL_CLASS_NAME,
        .version = G_VERSION,
        .taste = g_journal_taste,
        .ctlreq = g_journal_config,
        .dumpconf = g_journal_dumpconf,
        .init = g_journal_init,
        .fini = g_journal_fini
};

/* Forward declarations for functions used before their definitions. */
static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_switch_wait(struct g_journal_softc *sc);

/*
 * Switcher state.  The address of g_journal_switcher_state is the wakeup
 * channel used to kick the switcher; g_journal_switcher_wokenup is set to
 * 1 by whoever issues that wakeup so that it is not repeated.
 */
#define GJ_SWITCHER_WORKING     0
#define GJ_SWITCHER_DIE         1
#define GJ_SWITCHER_DIED        2
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;
  239 
#ifdef GJ_MEMDEBUG
/*
 * Debug header that gj_malloc() places directly in front of each
 * allocation; gj_free() uses it to detect size mismatches.
 */
struct meminfo {
        size_t          mi_size;        /* size requested at allocation */
        struct stack    mi_stack;       /* stack saved at allocation */
};
#endif
  246 
  247 /*
  248  * We use our own malloc/realloc/free funtions, so we can collect statistics
  249  * and force journal switch when we're running out of cache.
  250  */
  251 static void *
  252 gj_malloc(size_t size, int flags)
  253 {
  254         void *p;
  255 #ifdef GJ_MEMDEBUG
  256         struct meminfo *mi;
  257 #endif
  258 
  259         mtx_lock(&g_journal_cache_mtx);
  260         if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
  261             g_journal_cache_used + size > g_journal_cache_low) {
  262                 GJ_DEBUG(1, "No cache, waking up the switcher.");
  263                 g_journal_switcher_wokenup = 1;
  264                 wakeup(&g_journal_switcher_state);
  265         }
  266         if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
  267             g_journal_cache_used + size > g_journal_cache_limit) {
  268                 mtx_unlock(&g_journal_cache_mtx);
  269                 g_journal_cache_alloc_failures++;
  270                 return (NULL);
  271         }
  272         g_journal_cache_used += size;
  273         mtx_unlock(&g_journal_cache_mtx);
  274         flags &= ~M_NOWAIT;
  275 #ifndef GJ_MEMDEBUG
  276         p = malloc(size, M_JOURNAL, flags | M_WAITOK);
  277 #else
  278         mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
  279         p = (u_char *)mi + sizeof(*mi);
  280         mi->mi_size = size;
  281         stack_save(&mi->mi_stack);
  282 #endif
  283         return (p);
  284 }
  285 
  286 static void
  287 gj_free(void *p, size_t size)
  288 {
  289 #ifdef GJ_MEMDEBUG
  290         struct meminfo *mi;
  291 #endif
  292 
  293         KASSERT(p != NULL, ("p=NULL"));
  294         KASSERT(size > 0, ("size=0"));
  295         mtx_lock(&g_journal_cache_mtx);
  296         KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
  297         g_journal_cache_used -= size;
  298         mtx_unlock(&g_journal_cache_mtx);
  299 #ifdef GJ_MEMDEBUG
  300         mi = p = (void *)((u_char *)p - sizeof(*mi));
  301         if (mi->mi_size != size) {
  302                 printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
  303                     mi->mi_size);
  304                 printf("GJOURNAL: Alloc backtrace:\n");
  305                 stack_print(&mi->mi_stack);
  306                 printf("GJOURNAL: Free backtrace:\n");
  307                 kdb_backtrace();
  308         }
  309 #endif
  310         free(p, M_JOURNAL);
  311 }
  312 
  313 static void *
  314 gj_realloc(void *p, size_t size, size_t oldsize)
  315 {
  316         void *np;
  317 
  318 #ifndef GJ_MEMDEBUG
  319         mtx_lock(&g_journal_cache_mtx);
  320         g_journal_cache_used -= oldsize;
  321         g_journal_cache_used += size;
  322         mtx_unlock(&g_journal_cache_mtx);
  323         np = realloc(p, size, M_JOURNAL, M_WAITOK);
  324 #else
  325         np = gj_malloc(size, M_WAITOK);
  326         bcopy(p, np, MIN(oldsize, size));
  327         gj_free(p, oldsize);
  328 #endif
  329         return (np);
  330 }
  331 
  332 static void
  333 g_journal_check_overflow(struct g_journal_softc *sc)
  334 {
  335         off_t length, used;
  336 
  337         if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
  338              sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
  339             (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
  340              sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
  341              sc->sc_journal_offset < sc->sc_active.jj_offset)) {
  342                 panic("Journal overflow (joffset=%jd active=%jd inactive=%jd)",
  343                     (intmax_t)sc->sc_journal_offset,
  344                     (intmax_t)sc->sc_active.jj_offset,
  345                     (intmax_t)sc->sc_inactive.jj_offset);
  346         }
  347         if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
  348                 length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
  349                 used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  350         } else {
  351                 length = sc->sc_jend - sc->sc_active.jj_offset;
  352                 length += sc->sc_inactive.jj_offset - sc->sc_jstart;
  353                 if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
  354                         used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  355                 else {
  356                         used = sc->sc_jend - sc->sc_active.jj_offset;
  357                         used += sc->sc_journal_offset - sc->sc_jstart;
  358                 }
  359         }
  360         /* Already woken up? */
  361         if (g_journal_switcher_wokenup)
  362                 return;
  363         /*
  364          * If the active journal takes more than g_journal_force_switch precent
  365          * of free journal space, we force journal switch.
  366          */
  367         KASSERT(length > 0,
  368             ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
  369             (intmax_t)length, (intmax_t)used,
  370             (intmax_t)sc->sc_active.jj_offset,
  371             (intmax_t)sc->sc_inactive.jj_offset,
  372             (intmax_t)sc->sc_journal_offset));
  373         if ((used * 100) / length > g_journal_force_switch) {
  374                 g_journal_stats_journal_full++;
  375                 GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
  376                     sc->sc_name, (used * 100) / length);
  377                 mtx_lock(&g_journal_cache_mtx);
  378                 g_journal_switcher_wokenup = 1;
  379                 wakeup(&g_journal_switcher_state);
  380                 mtx_unlock(&g_journal_cache_mtx);
  381         }
  382 }
  383 
  384 static void
  385 g_journal_orphan(struct g_consumer *cp)
  386 {
  387         struct g_journal_softc *sc;
  388         char name[256];
  389         int error;
  390 
  391         g_topology_assert();
  392         sc = cp->geom->softc;
  393         strlcpy(name, cp->provider->name, sizeof(name));
  394         GJ_DEBUG(0, "Lost provider %s.", name);
  395         if (sc == NULL)
  396                 return;
  397         error = g_journal_destroy(sc);
  398         if (error == 0)
  399                 GJ_DEBUG(0, "Journal %s destroyed.", name);
  400         else {
  401                 GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
  402                     "Destroy it manually after last close.", sc->sc_name,
  403                     error);
  404         }
  405 }
  406 
  407 static int
  408 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
  409 {
  410         struct g_journal_softc *sc;
  411         int dcr, dcw, dce;
  412 
  413         g_topology_assert();
  414         GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
  415             acr, acw, ace);
  416 
  417         dcr = pp->acr + acr;
  418         dcw = pp->acw + acw;
  419         dce = pp->ace + ace;
  420 
  421         sc = pp->geom->softc;
  422         if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
  423                 if (acr <= 0 && acw <= 0 && ace <= 0)
  424                         return (0);
  425                 else
  426                         return (ENXIO);
  427         }
  428         if (pp->acw == 0 && dcw > 0) {
  429                 GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
  430                 sc->sc_flags &= ~GJF_DEVICE_CLEAN;
  431                 g_topology_unlock();
  432                 g_journal_metadata_update(sc);
  433                 g_topology_lock();
  434         } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
  435                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
  436                 sc->sc_flags |= GJF_DEVICE_CLEAN;
  437                 g_topology_unlock();
  438                 g_journal_metadata_update(sc);
  439                 g_topology_lock();
  440         } */
  441         return (0);
  442 }
  443 
  444 static void
  445 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
  446 {
  447 
  448         bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
  449         data += sizeof(GJ_HEADER_MAGIC);
  450         le32enc(data, hdr->jh_journal_id);
  451         data += 4;
  452         le32enc(data, hdr->jh_journal_next_id);
  453 }
  454 
  455 static int
  456 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
  457 {
  458 
  459         bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
  460         data += sizeof(hdr->jh_magic);
  461         if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
  462                 return (EINVAL);
  463         hdr->jh_journal_id = le32dec(data);
  464         data += 4;
  465         hdr->jh_journal_next_id = le32dec(data);
  466         return (0);
  467 }
  468 
  469 static void
  470 g_journal_flush_cache(struct g_journal_softc *sc)
  471 {
  472         struct bintime bt;
  473         int error;
  474 
  475         if (sc->sc_bio_flush == 0)
  476                 return;
  477         GJ_TIMER_START(1, &bt);
  478         if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
  479                 error = g_io_flush(sc->sc_jconsumer);
  480                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  481                     sc->sc_jconsumer->provider->name, error);
  482         }
  483         if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
  484                 /*
  485                  * TODO: This could be called in parallel with the
  486                  *       previous call.
  487                  */
  488                 error = g_io_flush(sc->sc_dconsumer);
  489                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  490                     sc->sc_dconsumer->provider->name, error);
  491         }
  492         GJ_TIMER_STOP(1, &bt, "Cache flush time");
  493 }
  494 
  495 static int
  496 g_journal_write_header(struct g_journal_softc *sc)
  497 {
  498         struct g_journal_header hdr;
  499         struct g_consumer *cp;
  500         u_char *buf;
  501         int error;
  502 
  503         cp = sc->sc_jconsumer;
  504         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  505 
  506         strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
  507         hdr.jh_journal_id = sc->sc_journal_id;
  508         hdr.jh_journal_next_id = sc->sc_journal_next_id;
  509         g_journal_header_encode(&hdr, buf);
  510         error = g_write_data(cp, sc->sc_journal_offset, buf,
  511             cp->provider->sectorsize);
  512         /* if (error == 0) */
  513         sc->sc_journal_offset += cp->provider->sectorsize;
  514 
  515         gj_free(buf, cp->provider->sectorsize);
  516         return (error);
  517 }
  518 
/*
 * Every journal record has a header and data following it.
 * Functions below are used to encode the header to little endian before
 * storing it and to decode it to system endianness after reading.
 */
  524 static void
  525 g_journal_record_header_encode(struct g_journal_record_header *hdr,
  526     u_char *data)
  527 {
  528         struct g_journal_entry *ent;
  529         u_int i;
  530 
  531         bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
  532         data += sizeof(GJ_RECORD_HEADER_MAGIC);
  533         le32enc(data, hdr->jrh_journal_id);
  534         data += 8;
  535         le16enc(data, hdr->jrh_nentries);
  536         data += 2;
  537         bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
  538         data += 8;
  539         for (i = 0; i < hdr->jrh_nentries; i++) {
  540                 ent = &hdr->jrh_entries[i];
  541                 le64enc(data, ent->je_joffset);
  542                 data += 8;
  543                 le64enc(data, ent->je_offset);
  544                 data += 8;
  545                 le64enc(data, ent->je_length);
  546                 data += 8;
  547         }
  548 }
  549 
  550 static int
  551 g_journal_record_header_decode(const u_char *data,
  552     struct g_journal_record_header *hdr)
  553 {
  554         struct g_journal_entry *ent;
  555         u_int i;
  556 
  557         bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
  558         data += sizeof(hdr->jrh_magic);
  559         if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
  560                 return (EINVAL);
  561         hdr->jrh_journal_id = le32dec(data);
  562         data += 8;
  563         hdr->jrh_nentries = le16dec(data);
  564         data += 2;
  565         if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
  566                 return (EINVAL);
  567         bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
  568         data += 8;
  569         for (i = 0; i < hdr->jrh_nentries; i++) {
  570                 ent = &hdr->jrh_entries[i];
  571                 ent->je_joffset = le64dec(data);
  572                 data += 8;
  573                 ent->je_offset = le64dec(data);
  574                 data += 8;
  575                 ent->je_length = le64dec(data);
  576                 data += 8;
  577         }
  578         return (0);
  579 }
  580 
  581 /*
  582  * Function reads metadata from a provider (via the given consumer), decodes
  583  * it to system endianess and verifies its correctness.
  584  */
  585 static int
  586 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
  587 {
  588         struct g_provider *pp;
  589         u_char *buf;
  590         int error;
  591 
  592         g_topology_assert();
  593 
  594         error = g_access(cp, 1, 0, 0);
  595         if (error != 0)
  596                 return (error);
  597         pp = cp->provider;
  598         g_topology_unlock();
  599         /* Metadata is stored in last sector. */
  600         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
  601             &error);
  602         g_topology_lock();
  603         g_access(cp, -1, 0, 0);
  604         if (buf == NULL) {
  605                 GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
  606                     cp->provider->name, error);
  607                 return (error);
  608         }
  609 
  610         /* Decode metadata. */
  611         error = journal_metadata_decode(buf, md);
  612         g_free(buf);
  613         /* Is this is gjournal provider at all? */
  614         if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
  615                 return (EINVAL);
  616         /*
  617          * Are we able to handle this version of metadata?
  618          * We only maintain backward compatibility.
  619          */
  620         if (md->md_version > G_JOURNAL_VERSION) {
  621                 GJ_DEBUG(0,
  622                     "Kernel module is too old to handle metadata from %s.",
  623                     cp->provider->name);
  624                 return (EINVAL);
  625         }
  626         /* Is checksum correct? */
  627         if (error != 0) {
  628                 GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
  629                     cp->provider->name);
  630                 return (error);
  631         }
  632         return (0);
  633 }
  634 
  635 /*
  636  * Two functions below are responsible for updating metadata.
  637  * Only metadata on the data provider is updated (we need to update
  638  * information about active journal in there).
  639  */
  640 static void
  641 g_journal_metadata_done(struct bio *bp)
  642 {
  643 
  644         /*
  645          * There is not much we can do on error except informing about it.
  646          */
  647         if (bp->bio_error != 0) {
  648                 GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
  649                     bp->bio_error);
  650         } else {
  651                 GJ_LOGREQ(2, bp, "Metadata updated.");
  652         }
  653         gj_free(bp->bio_data, bp->bio_length);
  654         g_destroy_bio(bp);
  655 }
  656 
  657 static void
  658 g_journal_metadata_update(struct g_journal_softc *sc)
  659 {
  660         struct g_journal_metadata md;
  661         struct g_consumer *cp;
  662         struct bio *bp;
  663         u_char *sector;
  664 
  665         cp = sc->sc_dconsumer;
  666         sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  667         strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
  668         md.md_version = G_JOURNAL_VERSION;
  669         md.md_id = sc->sc_id;
  670         md.md_type = sc->sc_orig_type;
  671         md.md_jstart = sc->sc_jstart;
  672         md.md_jend = sc->sc_jend;
  673         md.md_joffset = sc->sc_inactive.jj_offset;
  674         md.md_jid = sc->sc_journal_previous_id;
  675         md.md_flags = 0;
  676         if (sc->sc_flags & GJF_DEVICE_CLEAN)
  677                 md.md_flags |= GJ_FLAG_CLEAN;
  678 
  679         if (sc->sc_flags & GJF_DEVICE_HARDCODED)
  680                 strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
  681         else
  682                 bzero(md.md_provider, sizeof(md.md_provider));
  683         md.md_provsize = cp->provider->mediasize;
  684         journal_metadata_encode(&md, sector);
  685 
  686         /*
  687          * Flush the cache, so we know all data are on disk.
  688          * We write here informations like "journal is consistent", so we need
  689          * to be sure it is. Without BIO_FLUSH here, we can end up in situation
  690          * where metadata is stored on disk, but not all data.
  691          */
  692         g_journal_flush_cache(sc);
  693 
  694         bp = g_alloc_bio();
  695         bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
  696         bp->bio_length = cp->provider->sectorsize;
  697         bp->bio_data = sector;
  698         bp->bio_cmd = BIO_WRITE;
  699         if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
  700                 bp->bio_done = g_journal_metadata_done;
  701                 g_io_request(bp, cp);
  702         } else {
  703                 bp->bio_done = NULL;
  704                 g_io_request(bp, cp);
  705                 biowait(bp, "gjmdu");
  706                 g_journal_metadata_done(bp);
  707         }
  708 
  709         /*
  710          * Be sure metadata reached the disk.
  711          */
  712         g_journal_flush_cache(sc);
  713 }
  714 
  715 /*
  716  * This is where the I/O request comes from the GEOM.
  717  */
  718 static void
  719 g_journal_start(struct bio *bp)
  720 {
  721         struct g_journal_softc *sc;
  722 
  723         sc = bp->bio_to->geom->softc;
  724         GJ_LOGREQ(3, bp, "Request received.");
  725 
  726         switch (bp->bio_cmd) {
  727         case BIO_READ:
  728         case BIO_WRITE:
  729                 mtx_lock(&sc->sc_mtx);
  730                 bioq_insert_tail(&sc->sc_regular_queue, bp);
  731                 wakeup(sc);
  732                 mtx_unlock(&sc->sc_mtx);
  733                 return;
  734         case BIO_GETATTR:
  735                 if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
  736                         strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
  737                         bp->bio_completed = strlen(bp->bio_to->name) + 1;
  738                         g_io_deliver(bp, 0);
  739                         return;
  740                 }
  741                 /* FALLTHROUGH */
  742         case BIO_DELETE:
  743         default:
  744                 g_io_deliver(bp, EOPNOTSUPP);
  745                 return;
  746         }
  747 }
  748 
  749 static void
  750 g_journal_std_done(struct bio *bp)
  751 {
  752         struct g_journal_softc *sc;
  753 
  754         sc = bp->bio_from->geom->softc;
  755         mtx_lock(&sc->sc_mtx);
  756         bioq_insert_tail(&sc->sc_back_queue, bp);
  757         wakeup(sc);
  758         mtx_unlock(&sc->sc_mtx);
  759 }
  760 
  761 static struct bio *
  762 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
  763     int flags)
  764 {
  765         struct bio *bp;
  766 
              /*
               * Build a BIO_WRITE bio describing the range [start, end) whose
               * data lives at 'joffset' in the journal, with g_journal_std_done
               * as its completion handler.
               *
               * If 'data' is NULL the bio carries no data (bio_data == NULL).
               * Otherwise end - start bytes are copied from 'data' into a
               * buffer obtained with gj_malloc(..., flags); if that allocation
               * returns NULL the bio is still returned, with bio_data == NULL.
               *
               * The result of g_alloc_bio() is used without a NULL check.
               */
  767         bp = g_alloc_bio();
  768         bp->bio_offset = start;
  769         bp->bio_joffset = joffset;
  770         bp->bio_length = end - start;
  771         bp->bio_cmd = BIO_WRITE;
  772         bp->bio_done = g_journal_std_done;
  773         if (data == NULL)
  774                 bp->bio_data = NULL;
  775         else {
  776                 bp->bio_data = gj_malloc(bp->bio_length, flags);
  777                 if (bp->bio_data != NULL)
  778                         bcopy(data, bp->bio_data, bp->bio_length);
  779         }
  780         return (bp);
  781 }
  782 
  783 #define g_journal_insert_bio(head, bp, flags)                           \
              /* Insert the range, journal offset and data described by bp. */ \
  784         g_journal_insert((head), (bp)->bio_offset,                      \
  785                 (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
  786                 (bp)->bio_data, flags)
  787 /*
  788  * The function below does a lot more than just inserting a bio into the queue.
  789  * It keeps the queue sorted by offset and ensures that there is no duplicated
  790  * data (it combines bios where ranges overlap).
  791  *
       * head    - address of the queue head; updated when the new bio becomes
       *           the first element.
       * nstart,
       * nend    - the new range is [nstart, nend).
       * joffset - journal offset of the data for nstart.
       * data    - data for the new range, or NULL when there is none.
       * flags   - passed to gj_malloc() for every buffer allocated here.
       *
       * Where the new range overlaps an existing bio, the new data replaces the
       * overlapped part of the old one.  Whenever gj_malloc() returns NULL the
       * affected bio is left with bio_data == NULL.
       *
  792  * The function returns the number of bios inserted (as a bio can be split).
       * A bio that is reused in place (case 3 below) is not counted.
  793  */
  794 static int
  795 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
  796     u_char *data, int flags)
  797 {
  798         struct bio *nbp, *cbp, *pbp;
  799         off_t cstart, cend;
  800         u_char *tmpdata;
  801         int n;
  802 
              /*
               * NOTE(review): nstart, nend and joffset are off_t but are passed
               * for %jd without an (intmax_t) cast, unlike the ELEMENT trace at
               * the end of this function.
               */
  803         GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
  804             joffset);
  805         n = 0;
  806         pbp = NULL;
  807         GJQ_FOREACH(*head, cbp) {
  808                 cstart = cbp->bio_offset;
  809                 cend = cbp->bio_offset + cbp->bio_length;
  810 
  811                 if (nstart >= cend) {
  812                         /*
  813                          *  +-------------+
  814                          *  |             |
  815                          *  |   current   |  +-------------+
  816                          *  |     bio     |  |             |
  817                          *  |             |  |     new     |
  818                          *  +-------------+  |     bio     |
  819                          *                   |             |
  820                          *                   +-------------+
  821                          */
                              /* No overlap, new range is further on: keep walking. */
  822                         GJ_DEBUG(3, "INSERT(%p): 1", *head);
  823                 } else if (nend <= cstart) {
  824                         /*
  825                          *                   +-------------+
  826                          *                   |             |
  827                          *  +-------------+  |   current   |
  828                          *  |             |  |     bio     |
  829                          *  |     new     |  |             |
  830                          *  |     bio     |  +-------------+
  831                          *  |             |
  832                          *  +-------------+
  833                          */
                              /* No overlap: link the new bio in front of cbp. */
  834                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  835                             flags);
  836                         if (pbp == NULL)
  837                                 *head = nbp;
  838                         else
  839                                 pbp->bio_next = nbp;
  840                         nbp->bio_next = cbp;
  841                         n++;
  842                         GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
  843                             pbp);
  844                         goto end;
  845                 } else if (nstart <= cstart && nend >= cend) {
  846                         /*
  847                          *      +-------------+      +-------------+
  848                          *      | current bio |      | current bio |
  849                          *  +---+-------------+---+  +-------------+---+
  850                          *  |   |             |   |  |             |   |
  851                          *  |   |             |   |  |             |   |
  852                          *  |   +-------------+   |  +-------------+   |
  853                          *  |       new bio       |  |     new bio     |
  854                          *  +---------------------+  +-----------------+
  855                          *
  856                          *      +-------------+  +-------------+
  857                          *      | current bio |  | current bio |
  858                          *  +---+-------------+  +-------------+
  859                          *  |   |             |  |             |
  860                          *  |   |             |  |             |
  861                          *  |   +-------------+  +-------------+
  862                          *  |     new bio     |  |   new bio   |
  863                          *  +-----------------+  +-------------+
  864                          */
                              /*
                               * The current bio is covered completely.  It is
                               * reused in place for [nstart, cend) with the new
                               * data; the old buffer is released using its old
                               * size (cend - cstart).  What remains of the new
                               * range, [cend, nend), is handled by the following
                               * iterations.
                               */
  865                         g_journal_stats_bytes_skipped += cbp->bio_length;
  866                         cbp->bio_offset = nstart;
  867                         cbp->bio_joffset = joffset;
  868                         cbp->bio_length = cend - nstart;
  869                         if (cbp->bio_data != NULL) {
  870                                 gj_free(cbp->bio_data, cend - cstart);
  871                                 cbp->bio_data = NULL;
  872                         }
  873                         if (data != NULL) {
  874                                 cbp->bio_data = gj_malloc(cbp->bio_length,
  875                                     flags);
  876                                 if (cbp->bio_data != NULL) {
  877                                         bcopy(data, cbp->bio_data,
  878                                             cbp->bio_length);
  879                                 }
  880                                 data += cend - nstart;
  881                         }
  882                         joffset += cend - nstart;
  883                         nstart = cend;
  884                         GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
  885                 } else if (nstart > cstart && nend >= cend) {
  886                         /*
  887                          *  +-----------------+  +-------------+
  888                          *  |   current bio   |  | current bio |
  889                          *  |   +-------------+  |   +---------+---+
  890                          *  |   |             |  |   |         |   |
  891                          *  |   |             |  |   |         |   |
  892                          *  +---+-------------+  +---+---------+   |
  893                          *      |   new bio   |      |   new bio   |
  894                          *      +-------------+      +-------------+
  895                          */
                              /*
                               * The tail of the current bio is overwritten.
                               * A new bio for [nstart, cend) is linked right
                               * after cbp and cbp is trimmed to
                               * [cstart, nstart).  The remainder [cend, nend)
                               * is handled by the following iterations.
                               */
  896                         g_journal_stats_bytes_skipped += cend - nstart;
  897                         nbp = g_journal_new_bio(nstart, cend, joffset, data,
  898                             flags);
  899                         nbp->bio_next = cbp->bio_next;
  900                         cbp->bio_next = nbp;
  901                         cbp->bio_length = nstart - cstart;
  902                         if (cbp->bio_data != NULL) {
  903                                 cbp->bio_data = gj_realloc(cbp->bio_data,
  904                                     cbp->bio_length, cend - cstart);
  905                         }
  906                         if (data != NULL)
  907                                 data += cend - nstart;
  908                         joffset += cend - nstart;
  909                         nstart = cend;
  910                         n++;
  911                         GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
  912                 } else if (nstart > cstart && nend < cend) {
  913                         /*
  914                          *  +---------------------+
  915                          *  |     current bio     |
  916                          *  |   +-------------+   |
  917                          *  |   |             |   |
  918                          *  |   |             |   |
  919                          *  +---+-------------+---+
  920                          *      |   new bio   |
  921                          *      +-------------+
  922                          */
                              /*
                               * The new range lies strictly inside the current
                               * bio, which is split in three: cbp keeps
                               * [cstart, nstart), then comes the new bio
                               * [nstart, nend), then a bio for the old tail
                               * [nend, cend).  The tail's data is copied out of
                               * cbp->bio_data by g_journal_new_bio(), so this
                               * has to happen before gj_realloc() shrinks that
                               * buffer.
                               */
  923                         g_journal_stats_bytes_skipped += nend - nstart;
  924                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  925                             flags);
  926                         nbp->bio_next = cbp->bio_next;
  927                         cbp->bio_next = nbp;
  928                         if (cbp->bio_data == NULL)
  929                                 tmpdata = NULL;
  930                         else
  931                                 tmpdata = cbp->bio_data + nend - cstart;
  932                         nbp = g_journal_new_bio(nend, cend,
  933                             cbp->bio_joffset + nend - cstart, tmpdata, flags);
  934                         nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
  935                         ((struct bio *)cbp->bio_next)->bio_next = nbp;
  936                         cbp->bio_length = nstart - cstart;
  937                         if (cbp->bio_data != NULL) {
  938                                 cbp->bio_data = gj_realloc(cbp->bio_data,
  939                                     cbp->bio_length, cend - cstart);
  940                         }
  941                         n += 2;
  942                         GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
  943                         goto end;
  944                 } else if (nstart <= cstart && nend < cend) {
  945                         /*
  946                          *  +-----------------+      +-------------+
  947                          *  |   current bio   |      | current bio |
  948                          *  +-------------+   |  +---+---------+   |
  949                          *  |             |   |  |   |         |   |
  950                          *  |             |   |  |   |         |   |
  951                          *  +-------------+---+  |   +---------+---+
  952                          *  |   new bio   |      |   new bio   |
  953                          *  +-------------+      +-------------+
  954                          */
                              /*
                               * The head of the current bio is overwritten.
                               * The new bio is linked in front of cbp and cbp is
                               * moved forward to [nend, cend).  Its surviving
                               * data is copied into a fresh buffer and the old
                               * one is released with its old size.
                               */
  955                         g_journal_stats_bytes_skipped += nend - nstart;
  956                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  957                             flags);
  958                         if (pbp == NULL)
  959                                 *head = nbp;
  960                         else
  961                                 pbp->bio_next = nbp;
  962                         nbp->bio_next = cbp;
  963                         cbp->bio_offset = nend;
  964                         cbp->bio_length = cend - nend;
  965                         cbp->bio_joffset += nend - cstart;
  966                         tmpdata = cbp->bio_data;
  967                         if (tmpdata != NULL) {
  968                                 cbp->bio_data = gj_malloc(cbp->bio_length,
  969                                     flags);
  970                                 if (cbp->bio_data != NULL) {
  971                                         bcopy(tmpdata + nend - cstart,
  972                                             cbp->bio_data, cbp->bio_length);
  973                                 }
  974                                 gj_free(tmpdata, cend - cstart);
  975                         }
  976                         n++;
  977                         GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
  978                         goto end;
  979                 }
                      /* Cases 3 and 4 may have consumed the whole new range. */
  980                 if (nstart == nend)
  981                         goto end;
  982                 pbp = cbp;
  983         }
              /* End of the queue: append what is left as the last element. */
  984         nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
  985         if (pbp == NULL)
  986                 *head = nbp;
  987         else
  988                 pbp->bio_next = nbp;
  989         nbp->bio_next = NULL;
  990         n++;
  991         GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
  992 end:
              /* The queue is walked for the dump only at debug level 3 or more. */
  993         if (g_journal_debug >= 3) {
  994                 GJQ_FOREACH(*head, cbp) {
  995                         GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
  996                             (intmax_t)cbp->bio_offset,
  997                             (intmax_t)cbp->bio_length,
  998                             (intmax_t)cbp->bio_joffset, cbp->bio_data);
  999                 }
 1000                 GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
 1001         }
 1002         return (n);
 1003 }
 1004 
 1005 /*
 1006  * The function combines neighbour bios trying to squeeze as much data as
 1007  * possible into one bio.
 1008  *
       * Two consecutive queue elements are joined only when both carry data,
       * the second starts exactly where the first ends, and the joined length
       * does not exceed MAXPHYS.  The second bio's data is appended to the
       * first one's buffer and the second bio is destroyed.
       *
 1009  * The function returns the number of bios combined (negative value).
       * Callers add the result to their element counters.
 1010  */
 1011 static int
 1012 g_journal_optimize(struct bio *head)
 1013 {
 1014         struct bio *cbp, *pbp;
 1015         int n;
 1016 
 1017         n = 0;
 1018         pbp = NULL;
 1019         GJQ_FOREACH(head, cbp) {
 1020                 /* Skip bios which have to be read first. */
 1021                 if (cbp->bio_data == NULL) {
                              /* Never join across a bio that has no data. */
 1022                         pbp = NULL;
 1023                         continue;
 1024                 }
 1025                 /* There is no previous bio yet. */
 1026                 if (pbp == NULL) {
 1027                         pbp = cbp;
 1028                         continue;
 1029                 }
 1030                 /* Is this a neighbour bio? */
 1031                 if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
 1032                         /* Be sure that bios queue is sorted. */
 1033                         KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
 1034                             ("poffset=%jd plength=%jd coffset=%jd",
 1035                             (intmax_t)pbp->bio_offset,
 1036                             (intmax_t)pbp->bio_length,
 1037                             (intmax_t)cbp->bio_offset));
 1038                         pbp = cbp;
 1039                         continue;
 1040                 }
 1041                 /* Be sure we don't end up with too big bio. */
 1042                 if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
 1043                         pbp = cbp;
 1044                         continue;
 1045                 }
 1046                 /* Ok, we can join bios. */
 1047                 GJ_LOGREQ(4, pbp, "Join: ");
 1048                 GJ_LOGREQ(4, cbp, "and: ");
                      /* Grow pbp's buffer to the joined size (new size, old size). */
 1049                 pbp->bio_data = gj_realloc(pbp->bio_data,
 1050                     pbp->bio_length + cbp->bio_length, pbp->bio_length);
 1051                 bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
 1052                     cbp->bio_length);
 1053                 gj_free(cbp->bio_data, cbp->bio_length);
 1054                 pbp->bio_length += cbp->bio_length;
 1055                 pbp->bio_next = cbp->bio_next;
 1056                 g_destroy_bio(cbp);
                      /*
                       * cbp is gone; point the iterator at the joined bio so
                       * the loop does not touch freed memory (presumably
                       * GJQ_FOREACH advances through cbp->bio_next - its
                       * definition is not shown here).
                       */
 1057                 cbp = pbp;
 1058                 g_journal_stats_combined_ios++;
 1059                 n--;
 1060                 GJ_LOGREQ(4, pbp, "Got: ");
 1061         }
 1062         return (n);
 1063 }
 1064 
 1065 /*
 1066  * TODO: Update comment.
 1067  * These are functions responsible for copying one portion of data from journal
 1068  * to the destination provider.
 1069  * The order goes like this:
 1070  * 1. Read the header, which contains information about data blocks
 1071  *    following it.
 1072  * 2. Read the data blocks from the journal.
 1073  * 3. Write the data blocks on the data provider.
 1074  *
 1075  * g_journal_copy_start()
 1076  * g_journal_copy_done() - got finished write request, logs potential errors.
 1077  */
 1078 
 1079 /*
 1080  * When there is no data in cache, this function is used to read it.
       *
       * A child BIO_READ request is created for 'bp': it reads bp->bio_length
       * bytes from offset bp->bio_joffset through the journal consumer into a
       * freshly allocated buffer.  The child inherits bp's bio_cflags, has 'bp'
       * as bio_parent and completes through g_journal_std_done().
       * Every call is counted in g_journal_cache_misses.
 1081  */
 1082 static void
 1083 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
 1084 {
 1085         struct bio *cbp;
 1086 
 1087         /*
 1088          * We were short in memory, so data was freed.
 1089          * In that case we need to read it back from journal.
 1090          */
 1091         cbp = g_alloc_bio();
 1092         cbp->bio_cflags = bp->bio_cflags;
 1093         cbp->bio_parent = bp;
 1094         cbp->bio_offset = bp->bio_joffset;
 1095         cbp->bio_length = bp->bio_length;
 1096         cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
 1097         cbp->bio_cmd = BIO_READ;
 1098         cbp->bio_done = g_journal_std_done;
 1099         GJ_LOGREQ(4, cbp, "READ FIRST");
 1100         g_io_request(cbp, sc->sc_jconsumer);
 1101         g_journal_cache_misses++;
 1102 }
 1103 
 1104 static void
 1105 g_journal_copy_send(struct g_journal_softc *sc)
 1106 {
 1107         struct bio *bioq, *bp, *lbp;
 1108 
              /*
               * Start copying queued bios of the inactive journal to the data
               * provider.
               *
               * Step 1: under sc_mtx, move bios from sc_inactive.jj_queue to a
               * local list (order preserved) until sc_copy_in_progress reaches
               * g_journal_parallel_copies or the queue is empty.
               */
 1109         bioq = lbp = NULL;
 1110         mtx_lock(&sc->sc_mtx);
 1111         for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
 1112                 bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
 1113                 if (bp == NULL)
 1114                         break;
 1115                 GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
 1116                 sc->sc_copy_in_progress++;
 1117                 GJQ_INSERT_AFTER(bioq, bp, lbp);
 1118                 lbp = bp;
 1119         }
 1120         mtx_unlock(&sc->sc_mtx);
              /*
               * Step 2: optionally join neighbours.  g_journal_optimize()
               * returns a non-positive count, so adding it lowers the
               * in-progress counter by the number of bios that disappeared.
               */
 1121         if (g_journal_do_optimize)
 1122                 sc->sc_copy_in_progress += g_journal_optimize(bioq);
              /*
               * Step 3: put every bio on sc_copy_queue and start it.  A bio
               * without data is first read back from the journal; otherwise it
               * is sent to the data consumer with bio_joffset cleared.
               */
 1123         while ((bp = GJQ_FIRST(bioq)) != NULL) {
 1124                 GJQ_REMOVE(bioq, bp);
 1125                 GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
 1126                 bp->bio_cflags = GJ_BIO_COPY;
 1127                 if (bp->bio_data == NULL)
 1128                         g_journal_read_first(sc, bp);
 1129                 else {
 1130                         bp->bio_joffset = 0;
 1131                         GJ_LOGREQ(4, bp, "SEND");
 1132                         g_io_request(bp, sc->sc_dconsumer);
 1133                 }
 1134         }
 1135 }
 1136 
 1137 static void
 1138 g_journal_copy_start(struct g_journal_softc *sc)
 1139 {
 1140 
 1141         /*
 1142          * Remember in metadata that we're starting to copy journaled data
 1143          * to the data provider.
 1144          * In case of power failure, we will copy these data once again on boot.
 1145          */
 1146         if (!sc->sc_journal_copying) {
                      /* Set the flag first so the metadata update records it. */
 1147                 sc->sc_journal_copying = 1;
 1148                 GJ_DEBUG(1, "Starting copy of journal.");
 1149                 g_journal_metadata_update(sc);
 1150         }
              /* Whether or not a copy was already running, send more requests. */
 1151         g_journal_copy_send(sc);
 1152 }
 1153 
 1154 /*
 1155  * Data block has been read from the journal provider.
       *
       * 'bp' is the finished read request created by g_journal_read_first();
       * its bio_parent is the copy request waiting for the data.
       *
       * On success the read buffer is handed over to the parent, the parent is
       * sent to the data consumer, the read request is destroyed and 0 is
       * returned.
       *
       * On a read error the buffer, the parent and the read request are all
       * released, sc_copy_in_progress is decremented and 1 is returned.
 1156  */
 1157 static int
 1158 g_journal_copy_read_done(struct bio *bp)
 1159 {
 1160         struct g_journal_softc *sc;
 1161         struct g_consumer *cp;
 1162         struct bio *pbp;
 1163 
 1164         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 1165             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 1166 
 1167         sc = bp->bio_from->geom->softc;
 1168         pbp = bp->bio_parent;
 1169 
 1170         if (bp->bio_error != 0) {
 1171                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 1172                     bp->bio_to->name, bp->bio_error);
 1173                 /*
 1174                  * We will not be able to deliver WRITE request as well.
                       * The parent was linked on sc_copy_queue by
                       * g_journal_copy_send() before the read was started, so
                       * it has to be unlinked before it is destroyed; otherwise
                       * the queue keeps a pointer to freed memory.
 1175                  */
 1176                 gj_free(bp->bio_data, bp->bio_length);
                      GJQ_REMOVE(sc->sc_copy_queue, pbp);
 1177                 g_destroy_bio(pbp);
 1178                 g_destroy_bio(bp);
 1179                 sc->sc_copy_in_progress--;
 1180                 return (1);
 1181         }
              /* The parent takes ownership of the buffer that was just read. */
 1182         pbp->bio_data = bp->bio_data;
 1183         cp = sc->sc_dconsumer;
 1184         g_io_request(pbp, cp);
 1185         GJ_LOGREQ(4, bp, "READ DONE");
 1186         g_destroy_bio(bp);
 1187         return (0);
 1188 }
 1189 
 1190 /*
 1191  * Data block has been written to the data provider.
       *
       * The request is removed from sc_copy_queue, its data buffer is released
       * and the bio is destroyed.  A write error is only logged.  When the
       * in-progress counter drops to zero, sc_journal_copying is cleared.
 1192  */
 1193 static void
 1194 g_journal_copy_write_done(struct bio *bp)
 1195 {
 1196         struct g_journal_softc *sc;
 1197 
 1198         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 1199             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 1200 
 1201         sc = bp->bio_from->geom->softc;
 1202         sc->sc_copy_in_progress--;
 1203 
 1204         if (bp->bio_error != 0) {
 1205                 GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
 1206                     bp->bio_error);
 1207         }
              /* Unlink before destroying so the queue never points at freed memory. */
 1208         GJQ_REMOVE(sc->sc_copy_queue, bp);
 1209         gj_free(bp->bio_data, bp->bio_length);
 1210         GJ_LOGREQ(4, bp, "DONE");
 1211         g_destroy_bio(bp);
 1212 
 1213         if (sc->sc_copy_in_progress == 0) {
 1214                 /*
 1215                  * This was the last write request for this journal.
 1216                  */
 1217                 GJ_DEBUG(1, "Data has been copied.");
 1218                 sc->sc_journal_copying = 0;
 1219         }
 1220 }
 1221 
 1222 static void g_journal_flush_done(struct bio *bp);
 1223 
 1224 /*
 1225  * Flush one record onto active journal provider.
       *
       * Up to g_journal_record_entries bios are taken from the head of
       * sc_current_queue.  A one-sector header request is appended to
       * sc_flush_queue first, followed by one write request per entry, laid
       * out back to back in the journal starting at sc_journal_offset.  Each
       * entry is also inserted into the active journal's queue (the cache).
       * Nothing is sent from here; the requests are only queued.
       * Does nothing when sc_current_count is zero.
 1226  */
 1227 static void
 1228 g_journal_flush(struct g_journal_softc *sc)
 1229 {
 1230         struct g_journal_record_header hdr;
 1231         struct g_journal_entry *ent;
 1232         struct g_provider *pp;
 1233         struct bio **bioq;
 1234         struct bio *bp, *fbp, *pbp;
 1235         off_t joffset, size;
 1236         u_char *data, hash[16];
 1237         MD5_CTX ctx;
 1238         u_int i;
 1239 
 1240         if (sc->sc_current_count == 0)
 1241                 return;
 1242 
 1243         size = 0;
 1244         pp = sc->sc_jprovider;
 1245         GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 1246         joffset = sc->sc_journal_offset;
 1247 
 1248         GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
 1249             sc->sc_current_count, pp->name, (intmax_t)joffset);
 1250 
 1251         /*
 1252          * Store 'journal id', so we know to which journal this record belongs.
               *
               * NOTE(review): hdr lives on the stack and is not cleared; only
               * the id, the entry count, the magic, the first jrh_nentries
               * entries and (with GJF_DEVICE_CHECKSUM) jrh_sum are written in
               * this function.  What g_journal_record_header_encode() reads
               * from it is not visible here - confirm nothing uninitialized is
               * encoded.
 1253          */
 1254         hdr.jrh_journal_id = sc->sc_journal_id;
 1255         /* Could be less than g_journal_record_entries if called due to timeout. */
 1256         hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
 1257         strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
 1258 
 1259         bioq = &sc->sc_active.jj_queue;
 1260         pbp = sc->sc_flush_queue;
 1261 
              /*
               * Queue the header request first so that it occupies the sector
               * in front of the data.  Its bio_data is attached at the end of
               * this function, once all entries are known.  bio_offset is -1;
               * g_journal_read_find() skips bios with that offset.
               */
 1262         fbp = g_alloc_bio();
 1263         fbp->bio_parent = NULL;
 1264         fbp->bio_cflags = GJ_BIO_JOURNAL;
 1265         fbp->bio_offset = -1;
 1266         fbp->bio_joffset = joffset;
 1267         fbp->bio_length = pp->sectorsize;
 1268         fbp->bio_cmd = BIO_WRITE;
 1269         fbp->bio_done = g_journal_std_done;
 1270         GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
 1271         pbp = fbp;
 1272         fbp->bio_to = pp;
 1273         GJ_LOGREQ(4, fbp, "FLUSH_OUT");
 1274         joffset += pp->sectorsize;
 1275         sc->sc_flush_count++;
 1276         if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 1277                 MD5Init(&ctx);
 1278 
 1279         for (i = 0; i < hdr.jrh_nentries; i++) {
                      /* Detach the first bio of the current queue. */
 1280                 bp = sc->sc_current_queue;
 1281                 KASSERT(bp != NULL, ("NULL bp"));
 1282                 bp->bio_to = pp;
 1283                 GJ_LOGREQ(4, bp, "FLUSHED");
 1284                 sc->sc_current_queue = bp->bio_next;
 1285                 bp->bio_next = NULL;
 1286                 sc->sc_current_count--;
 1287 
 1288                 /* Add to the header. */
 1289                 ent = &hdr.jrh_entries[i];
 1290                 ent->je_offset = bp->bio_offset;
 1291                 ent->je_joffset = joffset;
 1292                 ent->je_length = bp->bio_length;
                      /* 'size' is accumulated but not used later in this function. */
 1293                 size += ent->je_length;
 1294 
 1295                 data = bp->bio_data;
 1296                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 1297                         MD5Update(&ctx, data, ent->je_length);
                      /*
                       * The same bio structure is reused as the journal write
                       * request: it is wiped and refilled from the entry just
                       * recorded, keeping the original data buffer.
                       */
 1298                 bzero(bp, sizeof(*bp));
 1299                 bp->bio_cflags = GJ_BIO_JOURNAL;
 1300                 bp->bio_offset = ent->je_offset;
 1301                 bp->bio_joffset = ent->je_joffset;
 1302                 bp->bio_length = ent->je_length;
 1303                 bp->bio_data = data;
 1304                 bp->bio_cmd = BIO_WRITE;
 1305                 bp->bio_done = g_journal_std_done;
 1306                 GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
 1307                 pbp = bp;
 1308                 bp->bio_to = pp;
 1309                 GJ_LOGREQ(4, bp, "FLUSH_OUT");
 1310                 joffset += bp->bio_length;
 1311                 sc->sc_flush_count++;
 1312 
 1313                 /*
 1314                  * Add request to the active sc_journal_queue queue.
 1315                  * This is our cache. After journal switch we don't have to
 1316                  * read the data from the inactive journal, because we keep
 1317                  * it in memory.
                       *
                       * M_NOWAIT is used, so the cached copy may end up without
                       * data (bio_data == NULL).  The return value (number of
                       * bios inserted) is not used here.
 1318                  */
 1319                 g_journal_insert(bioq, ent->je_offset,
 1320                     ent->je_offset + ent->je_length, ent->je_joffset, data,
 1321                     M_NOWAIT);
 1322         }
 1323 
 1324         /*
 1325          * After all requests, store valid header.
 1326          */
 1327         data = gj_malloc(pp->sectorsize, M_WAITOK);
 1328         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 1329                 MD5Final(hash, &ctx);
                      /* Only the first sizeof(hdr.jrh_sum) bytes of the digest are kept. */
 1330                 bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
 1331         }
 1332         g_journal_record_header_encode(&hdr, data);
 1333         fbp->bio_data = data;
 1334 
              /* The next record starts right behind this one. */
 1335         sc->sc_journal_offset = joffset;
 1336 
 1337         g_journal_check_overflow(sc);
 1338 }
 1339 
 1340 /*
 1341  * Flush request finished.
       *
       * Decrements sc_flush_in_progress, logs a write error if there was one,
       * and releases the request's data buffer and the bio itself.  The error
       * is not propagated anywhere from here.
 1342  */
 1343 static void
 1344 g_journal_flush_done(struct bio *bp)
 1345 {
 1346         struct g_journal_softc *sc;
 1347         struct g_consumer *cp;
 1348 
              /* Only the bits selected by GJ_BIO_MASK have to match here. */
 1349         KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
 1350             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
 1351 
 1352         cp = bp->bio_from;
 1353         sc = cp->geom->softc;
 1354         sc->sc_flush_in_progress--;
 1355 
 1356         if (bp->bio_error != 0) {
 1357                 GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
 1358                     bp->bio_error);
 1359         }
 1360         gj_free(bp->bio_data, bp->bio_length);
 1361         GJ_LOGREQ(4, bp, "DONE");
 1362         g_destroy_bio(bp);
 1363 }
 1364 
 1365 static void g_journal_release_delayed(struct g_journal_softc *sc);
 1366 
 1367 static void
 1368 g_journal_flush_send(struct g_journal_softc *sc)
 1369 {
 1370         struct g_consumer *cp;
 1371         struct bio *bioq, *bp, *lbp;
 1372 
              /*
               * Send queued flush requests to the journal consumer, keeping at
               * most g_journal_parallel_flushes of them in flight.
               *
               * Requests are first collected on a local list (order preserved),
               * optionally joined by g_journal_optimize(), and only then issued.
               */
 1373         cp = sc->sc_jconsumer;
 1374         bioq = lbp = NULL;
 1375         while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
 1376                 /* Send one flush request to the active journal. */
 1377                 bp = GJQ_FIRST(sc->sc_flush_queue);
 1378                 if (bp != NULL) {
 1379                         GJQ_REMOVE(sc->sc_flush_queue, bp);
 1380                         sc->sc_flush_count--;
                              /*
                               * The write goes to the journal, so the journal
                               * offset becomes the request's offset.
                               */
 1381                         bp->bio_offset = bp->bio_joffset;
 1382                         bp->bio_joffset = 0;
 1383                         sc->sc_flush_in_progress++;
 1384                         GJQ_INSERT_AFTER(bioq, bp, lbp);
 1385                         lbp = bp;
 1386                 }
 1387                 /* Try to release delayed requests. */
 1388                 g_journal_release_delayed(sc);
 1389                 /* If there are no requests to flush, leave. */
 1390                 if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
 1391                         break;
 1392         }
              /* The (non-positive) result accounts for bios that were joined. */
 1393         if (g_journal_do_optimize)
 1394                 sc->sc_flush_in_progress += g_journal_optimize(bioq);
 1395         while ((bp = GJQ_FIRST(bioq)) != NULL) {
 1396                 GJQ_REMOVE(bioq, bp);
 1397                 GJ_LOGREQ(3, bp, "Flush request send");
 1398                 g_io_request(bp, cp);
 1399         }
 1400 }
 1401 
 1402 static void
 1403 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
 1404 {
 1405         int n;
 1406 
              /*
               * Copy the request into sc_current_queue and complete it.
               *
               * g_journal_insert_bio() creates its own bios and copies of the
               * data, which is why 'bp' can be delivered to the caller right
               * away.  sc_current_count is adjusted by the number of bios
               * inserted and then by the (non-positive) number of bios joined
               * by g_journal_optimize().
               */
 1407         GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
 1408         n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
 1409         sc->sc_current_count += n;
 1410         n = g_journal_optimize(sc->sc_current_queue);
 1411         sc->sc_current_count += n;
 1412         /*
 1413          * For requests which are added to the current queue we deliver
 1414          * response immediately.
 1415          */
 1416         bp->bio_completed = bp->bio_length;
 1417         g_io_deliver(bp, 0);
 1418         if (sc->sc_current_count >= g_journal_record_entries) {
 1419                 /*
 1420                  * Let's flush one record onto active journal provider.
 1421                  */
 1422                 g_journal_flush(sc);
 1423         }
 1424 }
 1425 
 1426 static void
 1427 g_journal_release_delayed(struct g_journal_softc *sc)
 1428 {
 1429         struct bio *bp;
 1430 
              /*
               * Move delayed requests, oldest first, to the current queue for
               * as long as sc_flush_count stays below
               * g_journal_accept_immediately and the delayed queue is not empty.
               */
 1431         for (;;) {
 1432                 /* The flush queue is full, exit. */
 1433                 if (sc->sc_flush_count >= g_journal_accept_immediately)
 1434                         return;
 1435                 bp = bioq_takefirst(&sc->sc_delayed_queue);
 1436                 if (bp == NULL)
 1437                         return;
 1438                 sc->sc_delayed_count--;
                      /* This may flush a record, which raises sc_flush_count. */
 1439                 g_journal_add_current(sc, bp);
 1440         }
 1441 }
 1442 
 1443 /*
 1444  * Add I/O request to the current queue. If we have enough requests for one
 1445  * journal record we flush them onto active journal provider.
       *
       * The request is put on the delayed queue instead when the flush queue
       * is full or when other requests are already delayed.
 1446  */
 1447 static void
 1448 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
 1449 {
 1450 
 1451         /*
 1452          * The flush queue is full, we need to delay the request.
               * A request is also delayed whenever others are already waiting,
               * so it is queued behind them rather than ahead of them.
 1453          */
 1454         if (sc->sc_delayed_count > 0 ||
 1455             sc->sc_flush_count >= g_journal_accept_immediately) {
 1456                 GJ_LOGREQ(4, bp, "DELAYED");
 1457                 bioq_insert_tail(&sc->sc_delayed_queue, bp);
 1458                 sc->sc_delayed_count++;
 1459                 return;
 1460         }
 1461 
 1462         KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
 1463             ("DELAYED queue not empty."));
 1464         g_journal_add_current(sc, bp);
 1465 }
 1466 
 1467 static void g_journal_read_done(struct bio *bp);
 1468 
 1469 /*
 1470  * Try to find requested data in cache.
       *
       * head   - queue to search.
       * sorted - non-zero when the queue is sorted by offset, which allows the
       *          search to stop at the first bio starting at or after oend.
       * pbp    - the read request being served; data is copied into its buffer
       *          and bio_completed is advanced.
       * ostart,
       * oend   - the range [ostart, oend) that is looked for.
       *
       * Only the first overlapping bio is handled per call.  Return value:
       *  - pbp, when the copy completed the whole request, which has then
       *    already been delivered;
       *  - the overlapping bio otherwise, whether its data was copied or it
       *    has no data (bio_data == NULL) and nothing was copied;
       *  - NULL, when a sorted queue was left early; presumably also when the
       *    whole queue was walked without a match (depends on GJQ_FOREACH,
       *    whose definition is not shown here).
 1471  */
 1472 static struct bio *
 1473 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
 1474     off_t oend)
 1475 {
 1476         off_t cstart, cend;
 1477         struct bio *bp;
 1478 
 1479         GJQ_FOREACH(head, bp) {
                      /* Offset -1 marks a bio with no data offset (record headers). */
 1480                 if (bp->bio_offset == -1)
 1481                         continue;
                      /* [cstart, cend) is the intersection with this bio, if any. */
 1482                 cstart = MAX(ostart, bp->bio_offset);
 1483                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1484                 if (cend <= ostart)
 1485                         continue;
 1486                 else if (cstart >= oend) {
 1487                         if (!sorted)
 1488                                 continue;
 1489                         else {
 1490                                 bp = NULL;
 1491                                 break;
 1492                         }
 1493                 }
                      /* Overlap found, but the data is not in memory. */
 1494                 if (bp->bio_data == NULL)
 1495                         break;
                      /*
                       * NOTE(review): cstart and cend are off_t passed for %jd
                       * without an (intmax_t) cast.
                       */
 1496                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 1497                     bp);
 1498                 bcopy(bp->bio_data + cstart - bp->bio_offset,
 1499                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 1500                 pbp->bio_completed += cend - cstart;
 1501                 if (pbp->bio_completed == pbp->bio_length) {
 1502                         /*
 1503                          * Cool, the whole request was in cache, deliver happy
 1504                          * message.
 1505                          */
 1506                         g_io_deliver(pbp, 0);
 1507                         return (pbp);
 1508                 }
 1509                 break;
 1510         }
 1511         return (bp);
 1512 }
 1513 
 1514 /*
 1515  * Try to find requested data in cache.
 1516  */
 1517 static struct bio *
 1518 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
 1519     off_t oend)
 1520 {
 1521         off_t cstart, cend;
 1522         struct bio *bp;
 1523 
 1524         TAILQ_FOREACH(bp, head, bio_queue) {
 1525                 cstart = MAX(ostart, bp->bio_offset);
 1526                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1527                 if (cend <= ostart)
 1528                         continue;
 1529                 else if (cstart >= oend)
 1530                         continue;
 1531                 KASSERT(bp->bio_data != NULL,
 1532                     ("%s: bio_data == NULL", __func__));
 1533                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 1534                     bp);
 1535                 bcopy(bp->bio_data + cstart - bp->bio_offset,
 1536                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 1537                 pbp->bio_completed += cend - cstart;
 1538                 if (pbp->bio_completed == pbp->bio_length) {
 1539                         /*
 1540                          * Cool, the whole request was in cache, deliver happy
 1541                          * message.
 1542                          */
 1543                         g_io_deliver(pbp, 0);
 1544                         return (pbp);
 1545                 }
 1546                 break;
 1547         }
 1548         return (bp);
 1549 }
 1550 
 1551 /*
 1552  * This function is used for colecting data on read.
 1553  * The complexity is because parts of the data can be stored in four different
 1554  * places:
 1555  * - in delayed requests
 1556  * - in memory - the data not yet send to the active journal provider
 1557  * - in requests which are going to be sent to the active journal
 1558  * - in the active journal
 1559  * - in the inactive journal
 1560  * - in the data provider
 1561  */
 1562 static void
 1563 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
 1564     off_t oend)
 1565 {
 1566         struct bio *bp, *nbp, *head;
 1567         off_t cstart, cend;
 1568         u_int i, sorted = 0;
 1569 
 1570         GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
 1571 
 1572         cstart = cend = -1;
 1573         bp = NULL;
 1574         head = NULL;
 1575         for (i = 0; i <= 5; i++) {
 1576                 switch (i) {
 1577                 case 0: /* Delayed requests. */
 1578                         head = NULL;
 1579                         sorted = 0;
 1580                         break;
 1581                 case 1: /* Not-yet-send data. */
 1582                         head = sc->sc_current_queue;
 1583                         sorted = 1;
 1584                         break;
 1585                 case 2: /* In-flight to the active journal. */
 1586                         head = sc->sc_flush_queue;
 1587                         sorted = 0;
 1588                         break;
 1589                 case 3: /* Active journal. */
 1590                         head = sc->sc_active.jj_queue;
 1591                         sorted = 1;
 1592                         break;
 1593                 case 4: /* Inactive journal. */
 1594                         /*
 1595                          * XXX: Here could be a race with g_journal_lowmem().
 1596                          */
 1597                         head = sc->sc_inactive.jj_queue;
 1598                         sorted = 1;
 1599                         break;
 1600                 case 5: /* In-flight to the data provider. */
 1601                         head = sc->sc_copy_queue;
 1602                         sorted = 0;
 1603                         break;
 1604                 default:
 1605                         panic("gjournal %s: i=%d", __func__, i);
 1606                 }
 1607                 if (i == 0)
 1608                         bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
 1609                 else
 1610                         bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
 1611                 if (bp == pbp) { /* Got the whole request. */
 1612                         GJ_DEBUG(2, "Got the whole request from %u.", i);
 1613                         return;
 1614                 } else if (bp != NULL) {
 1615                         cstart = MAX(ostart, bp->bio_offset);
 1616                         cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1617                         GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
 1618                             i, (intmax_t)cstart, (intmax_t)cend);
 1619                         break;
 1620                 }
 1621         }
 1622         if (bp != NULL) {
 1623                 if (bp->bio_data == NULL) {
 1624                         nbp = g_duplicate_bio(pbp);
 1625                         nbp->bio_cflags = GJ_BIO_READ;
 1626                         nbp->bio_data =
 1627                             pbp->bio_data + cstart - pbp->bio_offset;
 1628                         nbp->bio_offset =
 1629                             bp->bio_joffset + cstart - bp->bio_offset;
 1630                         nbp->bio_length = cend - cstart;
 1631                         nbp->bio_done = g_journal_read_done;
 1632                         g_io_request(nbp, sc->sc_jconsumer);
 1633                 }
 1634                 /*
 1635                  * If we don't have the whole request yet, call g_journal_read()
 1636                  * recursively.
 1637                  */
 1638                 if (ostart < cstart)
 1639                         g_journal_read(sc, pbp, ostart, cstart);
 1640                 if (oend > cend)
 1641                         g_journal_read(sc, pbp, cend, oend);
 1642         } else {
 1643                 /*
 1644                  * No data in memory, no data in journal.
 1645                  * Its time for asking data provider.
 1646                  */
 1647                 GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
 1648                 nbp = g_duplicate_bio(pbp);
 1649                 nbp->bio_cflags = GJ_BIO_READ;
 1650                 nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 1651                 nbp->bio_offset = ostart;
 1652                 nbp->bio_length = oend - ostart;
 1653                 nbp->bio_done = g_journal_read_done;
 1654                 g_io_request(nbp, sc->sc_dconsumer);
 1655                 /* We have the whole request, return here. */
 1656                 return;
 1657         }
 1658 }
 1659 
 1660 /*
 1661  * Function responsible for handling finished READ requests.
 1662  * Actually, g_std_done() could be used here, the only difference is that we
 1663  * log error.
 1664  */
 1665 static void
 1666 g_journal_read_done(struct bio *bp)
 1667 {
 1668         struct bio *pbp;
 1669 
 1670         KASSERT(bp->bio_cflags == GJ_BIO_READ,
 1671             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
 1672 
 1673         pbp = bp->bio_parent;
 1674         pbp->bio_inbed++;
 1675         pbp->bio_completed += bp->bio_length;
 1676 
 1677         if (bp->bio_error != 0) {
 1678                 if (pbp->bio_error == 0)
 1679                         pbp->bio_error = bp->bio_error;
 1680                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 1681                     bp->bio_to->name, bp->bio_error);
 1682         }
 1683         g_destroy_bio(bp);
 1684         if (pbp->bio_children == pbp->bio_inbed &&
 1685             pbp->bio_completed == pbp->bio_length) {
 1686                 /* We're done. */
 1687                 g_io_deliver(pbp, 0);
 1688         }
 1689 }
 1690 
 1691 /*
 1692  * Deactive current journal and active next one.
 1693  */
 1694 static void
 1695 g_journal_switch(struct g_journal_softc *sc)
 1696 {
 1697         struct g_provider *pp;
 1698 
 1699         if (JEMPTY(sc)) {
 1700                 GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 1701                 pp = LIST_FIRST(&sc->sc_geom->provider);
 1702                 if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
 1703                         sc->sc_flags |= GJF_DEVICE_CLEAN;
 1704                         GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 1705                         g_journal_metadata_update(sc);
 1706                 }
 1707         } else {
 1708                 GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
 1709 
 1710                 pp = sc->sc_jprovider;
 1711 
 1712                 sc->sc_journal_previous_id = sc->sc_journal_id;
 1713 
 1714                 sc->sc_journal_id = sc->sc_journal_next_id;
 1715                 sc->sc_journal_next_id = arc4random();
 1716 
 1717                 GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 1718 
 1719                 g_journal_write_header(sc);
 1720 
 1721                 sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
 1722                 sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
 1723 
 1724                 sc->sc_active.jj_offset =
 1725                     sc->sc_journal_offset - pp->sectorsize;
 1726                 sc->sc_active.jj_queue = NULL;
 1727 
 1728                 /*
 1729                  * Switch is done, start copying data from the (now) inactive
 1730                  * journal to the data provider.
 1731                  */
 1732                 g_journal_copy_start(sc);
 1733         }
 1734         mtx_lock(&sc->sc_mtx);
 1735         sc->sc_flags &= ~GJF_DEVICE_SWITCH;
 1736         mtx_unlock(&sc->sc_mtx);
 1737 }
 1738 
 1739 static void
 1740 g_journal_initialize(struct g_journal_softc *sc)
 1741 {
 1742 
 1743         sc->sc_journal_id = arc4random();
 1744         sc->sc_journal_next_id = arc4random();
 1745         sc->sc_journal_previous_id = sc->sc_journal_id;
 1746         sc->sc_journal_offset = sc->sc_jstart;
 1747         sc->sc_inactive.jj_offset = sc->sc_jstart;
 1748         g_journal_write_header(sc);
 1749         sc->sc_active.jj_offset = sc->sc_jstart;
 1750 }
 1751 
 1752 static void
 1753 g_journal_mark_as_dirty(struct g_journal_softc *sc)
 1754 {
 1755         const struct g_journal_desc *desc;
 1756         int i;
 1757 
 1758         GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
 1759         for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
 1760                 desc->jd_dirty(sc->sc_dconsumer);
 1761 }
 1762 
 1763 /*
 1764  * Function read record header from the given journal.
 1765  * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio
 1766  * and data on every call.
 1767  */
 1768 static int
 1769 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
 1770     void *data)
 1771 {
 1772         int error;
 1773 
 1774         bzero(bp, sizeof(*bp));
 1775         bp->bio_cmd = BIO_READ;
 1776         bp->bio_done = NULL;
 1777         bp->bio_offset = offset;
 1778         bp->bio_length = cp->provider->sectorsize;
 1779         bp->bio_data = data;
 1780         g_io_request(bp, cp);
 1781         error = biowait(bp, "gjs_read");
 1782         return (error);
 1783 }
 1784 
 1785 #if 0
/*
 * This function is called when we start the journal device and detect that
 * one of the journals was not fully copied.
 * Its purpose is to read all record headers from the journal and place them
 * in the inactive queue, so we can start the journal synchronization process
 * and the journal provider itself.
 * A design decision was made not to synchronize the whole journal here, as
 * it can take too much time. Reading headers only and delaying the
 * synchronization process until after the journal provider is started should
 * be the best choice.
 */
 1796 #endif
 1797 
/*
 * Scan the journal and rebuild the in-memory description of its contents.
 *
 * Starting at sc->sc_journal_offset the journal is read one sector at a
 * time.  The first sector must decode as a journal header carrying the
 * expected journal ID.  Every following sector is either
 *  - a record header, whose entries are collected (without data) on a
 *    local list, or
 *  - the header of the next journal ("termination"), at which point the
 *    collected entries are moved to the inactive queue and the journal
 *    offsets and IDs in the softc are advanced.
 * The scan stops at the first read error or at the first sector that is
 * neither of the two.
 *
 * If no termination was found (and the scan did not start at offset 0) the
 * journal is considered broken: the inactive queue is dropped, the journal
 * is reinitialized and the file systems are marked dirty.  Otherwise the
 * copy of the journaled data to the data provider is started.
 */
static void
g_journal_sync(struct g_journal_softc *sc)
{
        struct g_journal_record_header rhdr;
        struct g_journal_entry *ent;
        struct g_journal_header jhdr;
        struct g_consumer *cp;
        struct bio *bp, *fbp, *tbp;
        off_t joffset, offset;
        u_char *buf, sum[16];
        uint64_t id;
        MD5_CTX ctx;
        int error, found, i;

        found = 0;      /* Number of terminations (next-journal headers) seen. */
        fbp = NULL;     /* Entries collected since the last termination. */
        cp = sc->sc_jconsumer;
        /* One bio and one sector-sized buffer are reused for every read. */
        bp = g_alloc_bio();
        buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
        offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;

        GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);

        /*
         * Read and decode first journal header.
         */
        error = g_journal_sync_read(cp, bp, offset, buf);
        if (error != 0) {
                GJ_DEBUG(0, "Error while reading journal header from %s.",
                    cp->provider->name);
                goto end;
        }
        error = g_journal_header_decode(buf, &jhdr);
        if (error != 0) {
                GJ_DEBUG(0, "Cannot decode journal header from %s.",
                    cp->provider->name);
                goto end;
        }
        id = sc->sc_journal_id;
        if (jhdr.jh_journal_id != sc->sc_journal_id) {
                GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
                    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
                goto end;
        }
        offset += cp->provider->sectorsize;
        /* From now on 'id' is the ID the next journal header must carry. */
        id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;

        /* This loop is left only through 'break', always with error set. */
        for (;;) {
                /*
                 * If the biggest record won't fit, look for a record header or
                 * journal header from the beginning.
                 */
                GJ_VALIDATE_OFFSET(offset, sc);
                error = g_journal_sync_read(cp, bp, offset, buf);
                if (error != 0) {
                        /*
                         * Not good. Having an error while reading header
                         * means, that we cannot read next headers and in
                         * consequence we cannot find termination.
                         */
                        GJ_DEBUG(0,
                            "Error while reading record header from %s.",
                            cp->provider->name);
                        break;
                }

                error = g_journal_record_header_decode(buf, &rhdr);
                if (error != 0) {
                        GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
                            (intmax_t)offset, error);
                        /*
                         * This is not a record header.
                         * If we are lucky, this is next journal header.
                         */
                        error = g_journal_header_decode(buf, &jhdr);
                        if (error != 0) {
                                GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
                                    (intmax_t)offset, error);
                                /*
                                 * Nope, this is not journal header, which
                                 * basically means that journal is not
                                 * terminated properly.
                                 */
                                error = ENOENT;
                                break;
                        }
                        /*
                         * Ok. This is header of _some_ journal. Now we need to
                         * verify if this is header of the _next_ journal.
                         */
                        if (jhdr.jh_journal_id != id) {
                                GJ_DEBUG(1, "Journal ID mismatch at %jd "
                                    "(0x%08x != 0x%08x).", (intmax_t)offset,
                                    (u_int)jhdr.jh_journal_id, (u_int)id);
                                error = ENOENT;
                                break;
                        }

                        /* Found termination. */
                        found++;
                        GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
                            (intmax_t)offset, (u_int)id);
                        /* The journal starting here becomes the active one. */
                        sc->sc_active.jj_offset = offset;
                        sc->sc_journal_offset =
                            offset + cp->provider->sectorsize;
                        sc->sc_journal_id = id;
                        id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;

                        /*
                         * The journal that just ended is complete, so move
                         * its collected entries to the inactive queue.
                         */
                        while ((tbp = fbp) != NULL) {
                                fbp = tbp->bio_next;
                                GJ_LOGREQ(3, tbp, "Adding request.");
                                g_journal_insert_bio(&sc->sc_inactive.jj_queue,
                                    tbp, M_WAITOK);
                        }

                        /* Skip journal's header. */
                        offset += cp->provider->sectorsize;
                        continue;
                }

                /* Skip record's header. */
                offset += cp->provider->sectorsize;

                /*
                 * Add information about every record entry to the local list
                 * (fbp); the entries are moved to the inactive queue once the
                 * termination of this journal is found.
                 */
                if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
                        MD5Init(&ctx);
                for (i = 0; i < rhdr.jrh_nentries; i++) {
                        ent = &rhdr.jrh_entries[i];
                        GJ_DEBUG(3, "Insert entry: %jd %jd.",
                            (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
                        /* Only the location is recorded; no data is attached. */
                        g_journal_insert(&fbp, ent->je_offset,
                            ent->je_offset + ent->je_length, ent->je_joffset,
                            NULL, M_WAITOK);
                        if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
                                u_char *buf2;

                                /*
                                 * TODO: Should use faster function (like
                                 *       g_journal_sync_read()).
                                 */
                                buf2 = g_read_data(cp, offset, ent->je_length,
                                    NULL);
                                /*
                                 * A failed read is only logged; the entry is
                                 * then left out of the checksum.
                                 */
                                if (buf2 == NULL)
                                        GJ_DEBUG(0, "Cannot read data at %jd.",
                                            (intmax_t)offset);
                                else {
                                        MD5Update(&ctx, buf2, ent->je_length);
                                        g_free(buf2);
                                }
                        }
                        /* Skip entry's data. */
                        offset += ent->je_length;
                }
                if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
                        MD5Final(sum, &ctx);
                        /* A mismatch is reported but does not stop the scan. */
                        if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
                                GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
                                    (intmax_t)offset);
                        }
                }
        }
end:
        /*
         * bp->bio_data is the 'buf' allocated above: g_journal_sync_read()
         * stored it there and has run at least once before we get here.
         */
        gj_free(bp->bio_data, cp->provider->sectorsize);
        g_destroy_bio(bp);

        /* Remove bios from unterminated journal. */
        while ((tbp = fbp) != NULL) {
                fbp = tbp->bio_next;
                g_destroy_bio(tbp);
        }

        if (found < 1 && joffset > 0) {
                GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
                    sc->sc_name);
                /* Drop whatever is on the inactive queue and start over. */
                while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
                        sc->sc_inactive.jj_queue = tbp->bio_next;
                        g_destroy_bio(tbp);
                }
                g_journal_initialize(sc);
                g_journal_mark_as_dirty(sc);
        } else {
                GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
                /* Start copying the inactive journal to the data provider. */
                g_journal_copy_start(sc);
        }
}
 1986 
 1987 /*
 1988  * Wait for requests.
 1989  * If we have requests in the current queue, flush them after 3 seconds from the
 1990  * last flush. In this way we don't wait forever (or for journal switch) with
 1991  * storing not full records on journal.
 1992  */
 1993 static void
 1994 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
 1995 {
 1996         int error, timeout;
 1997 
 1998         GJ_DEBUG(3, "%s: enter", __func__);
 1999         if (sc->sc_current_count == 0) {
 2000                 if (g_journal_debug < 2)
 2001                         msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
 2002                 else {
 2003                         /*
 2004                          * If we have debug turned on, show number of elements
 2005                          * in various queues.
 2006                          */
 2007                         for (;;) {
 2008                                 error = msleep(sc, &sc->sc_mtx, PRIBIO,
 2009                                     "gj:work", hz * 3);
 2010                                 if (error == 0) {
 2011                                         mtx_unlock(&sc->sc_mtx);
 2012                                         break;
 2013                                 }
 2014                                 GJ_DEBUG(3, "Report: current count=%d",
 2015                                     sc->sc_current_count);
 2016                                 GJ_DEBUG(3, "Report: flush count=%d",
 2017                                     sc->sc_flush_count);
 2018                                 GJ_DEBUG(3, "Report: flush in progress=%d",
 2019                                     sc->sc_flush_in_progress);
 2020                                 GJ_DEBUG(3, "Report: copy in progress=%d",
 2021                                     sc->sc_copy_in_progress);
 2022                                 GJ_DEBUG(3, "Report: delayed=%d",
 2023                                     sc->sc_delayed_count);
 2024                         }
 2025                 }
 2026                 GJ_DEBUG(3, "%s: exit 1", __func__);
 2027                 return;
 2028         }
 2029 
 2030         /*
 2031          * Flush even not full records every 3 seconds.
 2032          */
 2033         timeout = (last_write + 3 - time_second) * hz;
 2034         if (timeout <= 0) {
 2035                 mtx_unlock(&sc->sc_mtx);
 2036                 g_journal_flush(sc);
 2037                 g_journal_flush_send(sc);
 2038                 GJ_DEBUG(3, "%s: exit 2", __func__);
 2039                 return;
 2040         }
 2041         error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
 2042         if (error == EWOULDBLOCK)
 2043                 g_journal_flush_send(sc);
 2044         GJ_DEBUG(3, "%s: exit 3", __func__);
 2045 }
 2046 
/*
 * Worker thread.
 *
 * 'arg' is the softc of the journal device.  On startup the thread
 *  - initializes a fresh journal if the device is marked clean, otherwise
 *    scans the existing journal with g_journal_sync(),
 *  - probes BIO_FLUSH support of the journal and data consumers,
 *  - creates the "<name>.journal" provider and releases the root mount hold,
 *    if one is recorded in the softc.
 * It then loops forever, taking bios from the back queue first and from the
 * regular queue second, and dispatching them by type.  When both queues are
 * empty it handles pending journal switch / destroy requests or sleeps in
 * g_journal_wait().
 */
static void
g_journal_worker(void *arg)
{
        struct g_journal_softc *sc;
        struct g_geom *gp;
        struct g_provider *pp;
        struct bio *bp;
        time_t last_write;
        int type;

        thread_lock(curthread);
        sched_prio(curthread, PRIBIO);
        thread_unlock(curthread);

        sc = arg;
        type = 0;       /* gcc */

        if (sc->sc_flags & GJF_DEVICE_CLEAN) {
                GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
                g_journal_initialize(sc);
        } else {
                g_journal_sync(sc);
        }
        /*
         * Check if we can use BIO_FLUSH.
         */
        sc->sc_bio_flush = 0;
        if (g_io_flush(sc->sc_jconsumer) == 0) {
                sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
                GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
                    sc->sc_jconsumer->provider->name);
        } else {
                GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
                    sc->sc_jconsumer->provider->name);
        }
        /* Probe the data consumer only if it differs from the journal one. */
        if (sc->sc_jconsumer != sc->sc_dconsumer) {
                if (g_io_flush(sc->sc_dconsumer) == 0) {
                        sc->sc_bio_flush |= GJ_FLUSH_DATA;
                        GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
                            sc->sc_dconsumer->provider->name);
                } else {
                        GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
                            sc->sc_dconsumer->provider->name);
                }
        }

        /* Create the provider through which the journaled device is used. */
        gp = sc->sc_geom;
        g_topology_lock();
        pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
        KASSERT(pp != NULL, ("Cannot create %s.journal.", sc->sc_name));
        pp->mediasize = sc->sc_mediasize;
        /*
         * There could be a problem when data provider and journal providers
         * have different sectorsize, but such scenario is prevented on journal
         * creation.
         */
        pp->sectorsize = sc->sc_sectorsize;
        g_error_provider(pp, 0);
        g_topology_unlock();
        last_write = time_second;

        if (sc->sc_rootmount != NULL) {
                GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
                root_mount_rel(sc->sc_rootmount);
                sc->sc_rootmount = NULL;
        }

        for (;;) {
                /*
                 * Get first request from the queue.  The back queue is
                 * checked first; its bios carry their type in bio_cflags.
                 * Bios from the regular queue are always GJ_BIO_REGULAR.
                 */
                mtx_lock(&sc->sc_mtx);
                bp = bioq_first(&sc->sc_back_queue);
                if (bp != NULL)
                        type = (bp->bio_cflags & GJ_BIO_MASK);
                if (bp == NULL) {
                        bp = bioq_first(&sc->sc_regular_queue);
                        if (bp != NULL)
                                type = GJ_BIO_REGULAR;
                }
                if (bp == NULL) {
                        /*
                         * Nothing to process.  This label is also entered
                         * (with sc_mtx held) from below when a regular write
                         * has to be held back because of a pending switch.
                         */
try_switch:
                        if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
                            (sc->sc_flags & GJF_DEVICE_DESTROY)) {
                                /*
                                 * Before switching or destroying, first push
                                 * out what is in the current queue, then wait
                                 * until no flush or copy is in progress.
                                 */
                                if (sc->sc_current_count > 0) {
                                        mtx_unlock(&sc->sc_mtx);
                                        g_journal_flush(sc);
                                        g_journal_flush_send(sc);
                                        continue;
                                }
                                if (sc->sc_flush_in_progress > 0)
                                        goto sleep;
                                if (sc->sc_copy_in_progress > 0)
                                        goto sleep;
                        }
                        if (sc->sc_flags & GJF_DEVICE_SWITCH) {
                                mtx_unlock(&sc->sc_mtx);
                                g_journal_switch(sc);
                                wakeup(&sc->sc_journal_copying);
                                continue;
                        }
                        if (sc->sc_flags & GJF_DEVICE_DESTROY) {
                                GJ_DEBUG(1, "Shutting down worker "
                                    "thread for %s.", gp->name);
                                /* Clear sc_worker and wake up its sleepers. */
                                sc->sc_worker = NULL;
                                wakeup(&sc->sc_worker);
                                mtx_unlock(&sc->sc_mtx);
                                kthread_exit(0);
                        }
                        /*
                         * g_journal_wait() is entered with sc_mtx held and
                         * returns with it released.
                         */
sleep:
                        g_journal_wait(sc, last_write);
                        continue;
                }
                /*
                 * If we're in switch process, we need to delay all new
                 * write requests until it's done.  The bio stays on the
                 * regular queue.
                 */
                if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
                    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
                        GJ_LOGREQ(2, bp, "WRITE on SWITCH");
                        goto try_switch;
                }
                /* Dequeue the bio from whichever queue it was found on. */
                if (type == GJ_BIO_REGULAR)
                        bioq_remove(&sc->sc_regular_queue, bp);
                else
                        bioq_remove(&sc->sc_back_queue, bp);
                mtx_unlock(&sc->sc_mtx);
                switch (type) {
                case GJ_BIO_REGULAR:
                        /* Regular request. */
                        switch (bp->bio_cmd) {
                        case BIO_READ:
                                g_journal_read(sc, bp, bp->bio_offset,
                                    bp->bio_offset + bp->bio_length);
                                break;
                        case BIO_WRITE:
                                /* Remember the time for the 3-second flush. */
                                last_write = time_second;
                                g_journal_add_request(sc, bp);
                                g_journal_flush_send(sc);
                                break;
                        default:
                                panic("Invalid bio_cmd (%d).", bp->bio_cmd);
                        }
                        break;
                case GJ_BIO_COPY:
                        /* A finished step of the journal -> data copy. */
                        switch (bp->bio_cmd) {
                        case BIO_READ:
                                if (g_journal_copy_read_done(bp))
                                        g_journal_copy_send(sc);
                                break;
                        case BIO_WRITE:
                                g_journal_copy_write_done(bp);
                                g_journal_copy_send(sc);
                                break;
                        default:
                                panic("Invalid bio_cmd (%d).", bp->bio_cmd);
                        }
                        break;
                case GJ_BIO_JOURNAL:
                        /* A finished flush bio; try to send more. */
                        g_journal_flush_done(bp);
                        g_journal_flush_send(sc);
                        break;
                case GJ_BIO_READ:
                        /* GJ_BIO_READ bios are not expected on these queues. */
                default:
                        panic("Invalid bio (%d).", type);
                }
        }
}
 2216 
 2217 static void
 2218 g_journal_destroy_event(void *arg, int flags __unused)
 2219 {
 2220         struct g_journal_softc *sc;
 2221 
 2222         g_topology_assert();
 2223         sc = arg;
 2224         g_journal_destroy(sc);
 2225 }
 2226 
 2227 static void
 2228 g_journal_timeout(void *arg)
 2229 {
 2230         struct g_journal_softc *sc;
 2231 
 2232         sc = arg;
 2233         GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
 2234             sc->sc_geom->name);
 2235         g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
 2236 }
 2237 
/*
 * Create a journal device for provider 'pp', or complete an existing one,
 * according to the metadata 'md'.
 *
 * A softc with the same md_id is reused unless it already contains one of
 * the parts (data and/or journal) described by 'md'; when no such softc
 * exists a new softc and geom are allocated.  A consumer is attached to 'pp'
 * and opened r1w1e1; when 'pp' carries both parts the same consumer is stored
 * as sc_dconsumer and sc_jconsumer.  Once both parts are present the timeout
 * callout is drained and the worker thread is created.
 *
 * Returns the geom on success (also while the device is still incomplete),
 * or NULL when the metadata is rejected or a step fails.  On failures after
 * the softc exists, the device is torn down with g_journal_destroy().
 */
static struct g_geom *
g_journal_create(struct g_class *mp, struct g_provider *pp,
    const struct g_journal_metadata *md)
{
        struct g_journal_softc *sc;
        struct g_geom *gp;
        struct g_consumer *cp;
        int error;

        sc = NULL;      /* gcc */

        g_topology_assert();
        /*
         * There are two possibilities:
         * 1. Data and both journals are on the same provider.
         * 2. Data and journals are all on separated providers.
         */
        /* Look for journal device with the same ID. */
        LIST_FOREACH(gp, &mp->geom, geom) {
                sc = gp->softc;
                if (sc == NULL)
                        continue;
                if (sc->sc_id == md->md_id)
                        break;
        }
        /* gp is NULL when the loop ran to completion without a match. */
        if (gp == NULL)
                sc = NULL;
        else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
                GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
                return (NULL);
        }
        /* The type must be non-empty and contain only GJ_TYPE_COMPLETE bits. */
        if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
                GJ_DEBUG(0, "Invalid type on %s.", pp->name);
                return (NULL);
        }
        if (md->md_type & GJ_TYPE_DATA) {
                GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
                    pp->name);
        }
        if (md->md_type & GJ_TYPE_JOURNAL) {
                GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
                    pp->name);
        }

        if (sc == NULL) {
                /* Action geom. */
                sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
                sc->sc_id = md->md_id;
                sc->sc_type = 0;
                sc->sc_flags = 0;
                sc->sc_worker = NULL;

                gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
                gp->start = g_journal_start;
                gp->orphan = g_journal_orphan;
                gp->access = g_journal_access;
                gp->softc = sc;
                sc->sc_geom = gp;

                mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);

                bioq_init(&sc->sc_back_queue);
                bioq_init(&sc->sc_regular_queue);
                bioq_init(&sc->sc_delayed_queue);
                sc->sc_delayed_count = 0;
                sc->sc_current_queue = NULL;
                sc->sc_current_count = 0;
                sc->sc_flush_queue = NULL;
                sc->sc_flush_count = 0;
                sc->sc_flush_in_progress = 0;
                sc->sc_copy_queue = NULL;
                sc->sc_copy_in_progress = 0;
                sc->sc_inactive.jj_queue = NULL;
                sc->sc_active.jj_queue = NULL;

                /* The hold is released again in g_journal_destroy(). */
                sc->sc_rootmount = root_mount_hold("GJOURNAL");
                GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);

                callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
                if (md->md_type != GJ_TYPE_COMPLETE) {
                        /*
                         * Journal and data are on separate providers.
                         * At this point we have only one of them.
                         * We setup a timeout in case the other part will not
                         * appear, so we won't wait forever.
                         */
                        callout_reset(&sc->sc_callout, 5 * hz,
                            g_journal_timeout, sc);
                }
        }

        /* Remember type of the data provider. */
        if (md->md_type & GJ_TYPE_DATA)
                sc->sc_orig_type = md->md_type;
        sc->sc_type |= md->md_type;
        cp = NULL;

        if (md->md_type & GJ_TYPE_DATA) {
                if (md->md_flags & GJ_FLAG_CLEAN)
                        sc->sc_flags |= GJF_DEVICE_CLEAN;
                if (md->md_flags & GJ_FLAG_CHECKSUM)
                        sc->sc_flags |= GJF_DEVICE_CHECKSUM;
                cp = g_new_consumer(gp);
                error = g_attach(cp, pp);
                KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
                    pp->name, error));
                error = g_access(cp, 1, 1, 1);
                if (error != 0) {
                        GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
                            error);
                        g_journal_destroy(sc);
                        return (NULL);
                }
                sc->sc_dconsumer = cp;
                /* One sector is excluded from the data size. */
                sc->sc_mediasize = pp->mediasize - pp->sectorsize;
                sc->sc_sectorsize = pp->sectorsize;
                sc->sc_jstart = md->md_jstart;
                sc->sc_jend = md->md_jend;
                if (md->md_provider[0] != '\0')
                        sc->sc_flags |= GJF_DEVICE_HARDCODED;
                sc->sc_journal_offset = md->md_joffset;
                sc->sc_journal_id = md->md_jid;
                sc->sc_journal_previous_id = md->md_jid;
        }
        if (md->md_type & GJ_TYPE_JOURNAL) {
                /* cp is still NULL when 'pp' does not also hold the data. */
                if (cp == NULL) {
                        cp = g_new_consumer(gp);
                        error = g_attach(cp, pp);
                        KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
                            pp->name, error));
                        error = g_access(cp, 1, 1, 1);
                        if (error != 0) {
                                GJ_DEBUG(0, "Cannot access %s (error=%d).",
                                    pp->name, error);
                                g_journal_destroy(sc);
                                return (NULL);
                        }
                } else {
                        /*
                         * Journal is on the same provider as data, which means
                         * that data provider ends where journal starts.
                         */
                        sc->sc_mediasize = md->md_jstart;
                }
                sc->sc_jconsumer = cp;
        }

        if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
                /* Journal is not complete yet. */
                return (gp);
        } else {
                /* Journal complete, cancel timeout. */
                callout_drain(&sc->sc_callout);
        }

        error = kthread_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
            "g_journal %s", sc->sc_name);
        if (error != 0) {
                GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
                    sc->sc_name);
                g_journal_destroy(sc);
                return (NULL);
        }

        return (gp);
}
 2404 
 2405 static void
 2406 g_journal_destroy_consumer(void *arg, int flags __unused)
 2407 {
 2408         struct g_consumer *cp;
 2409 
 2410         g_topology_assert();
 2411         cp = arg;
 2412         g_detach(cp);
 2413         g_destroy_consumer(cp);
 2414 }
 2415 
/*
 * Tear down a journal device.
 *
 * The topology lock must be held on entry (asserted).  On the success path
 * it is dropped while the root mount hold is released, the timeout callout
 * is drained and the worker thread is waited for, and it is taken again
 * before the geom is withered; the early error returns never drop it.
 *
 * Returns ENXIO if 'sc' is NULL, EBUSY if the device's provider is still
 * open, and 0 after the softc has been freed.
 */
static int
g_journal_destroy(struct g_journal_softc *sc)
{
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        if (sc == NULL)
                return (ENXIO);

        gp = sc->sc_geom;
        /* pp is NULL when the geom has no provider (yet). */
        pp = LIST_FIRST(&gp->provider);
        if (pp != NULL) {
                if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
                        GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
                            pp->name, pp->acr, pp->acw, pp->ace);
                        return (EBUSY);
                }
                g_error_provider(pp, ENXIO);

                g_journal_flush(sc);
                g_journal_flush_send(sc);
                g_journal_switch(sc);
        }

        sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);

        g_topology_unlock();

        if (sc->sc_rootmount != NULL) {
                GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
                root_mount_rel(sc->sc_rootmount);
                sc->sc_rootmount = NULL;
        }

        callout_drain(&sc->sc_callout);
        mtx_lock(&sc->sc_mtx);
        /*
         * Wake up whoever sleeps on the softc and wait until sc_worker is
         * cleared (presumably by the worker thread itself when it notices
         * GJF_DEVICE_DESTROY -- the worker is outside this block).
         */
        wakeup(sc);
        while (sc->sc_worker != NULL)
                msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
        mtx_unlock(&sc->sc_mtx);

        if (pp != NULL) {
                GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
                /* Metadata is updated before the topology lock is retaken. */
                g_journal_metadata_update(sc);
                g_topology_lock();
                pp->flags |= G_PF_WITHER;
                g_orphan_provider(pp, ENXIO);
        } else {
                g_topology_lock();
        }
        mtx_destroy(&sc->sc_mtx);

        if (sc->sc_current_count != 0) {
                GJ_DEBUG(0, "Warning! Number of current requests %d.",
                    sc->sc_current_count);
        }

        LIST_FOREACH(cp, &gp->consumer, consumer) {
                /* Drop the r1w1e1 reference if this consumer was opened. */
                if (cp->acr + cp->acw + cp->ace > 0)
                        g_access(cp, -1, -1, -1);
                /*
                 * We keep all consumers open for writing, so if I'll detach
                 * and destroy consumer here, I'll get providers for taste, so
                 * journal will be started again.
                 * Sending an event here, prevents this from happening.
                 */
                g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
        }
        gp->softc = NULL;
        g_wither_geom(gp, ENXIO);
        free(sc, M_JOURNAL);
        return (0);
}
 2492 
 2493 static void
 2494 g_journal_taste_orphan(struct g_consumer *cp)
 2495 {
 2496 
 2497         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 2498             cp->provider->name));
 2499 }
 2500 
 2501 static struct g_geom *
 2502 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 2503 {
 2504         struct g_journal_metadata md;
 2505         struct g_consumer *cp;
 2506         struct g_geom *gp;
 2507         int error;
 2508 
 2509         g_topology_assert();
 2510         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 2511         GJ_DEBUG(2, "Tasting %s.", pp->name);
 2512         if (pp->geom->class == mp)
 2513                 return (NULL);
 2514 
 2515         gp = g_new_geomf(mp, "journal:taste");
 2516         /* This orphan function should be never called. */
 2517         gp->orphan = g_journal_taste_orphan;
 2518         cp = g_new_consumer(gp);
 2519         g_attach(cp, pp);
 2520         error = g_journal_metadata_read(cp, &md);
 2521         g_detach(cp);
 2522         g_destroy_consumer(cp);
 2523         g_destroy_geom(gp);
 2524         if (error != 0)
 2525                 return (NULL);
 2526         gp = NULL;
 2527 
 2528         if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
 2529                 return (NULL);
 2530         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 2531                 return (NULL);
 2532         if (g_journal_debug >= 2)
 2533                 journal_metadata_dump(&md);
 2534 
 2535         gp = g_journal_create(mp, pp, &md);
 2536         return (gp);
 2537 }
 2538 
 2539 static struct g_journal_softc *
 2540 g_journal_find_device(struct g_class *mp, const char *name)
 2541 {
 2542         struct g_journal_softc *sc;
 2543         struct g_geom *gp;
 2544         struct g_provider *pp;
 2545 
 2546         if (strncmp(name, "/dev/", 5) == 0)
 2547                 name += 5;
 2548         LIST_FOREACH(gp, &mp->geom, geom) {
 2549                 sc = gp->softc;
 2550                 if (sc == NULL)
 2551                         continue;
 2552                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
 2553                         continue;
 2554                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 2555                         continue;
 2556                 pp = LIST_FIRST(&gp->provider);
 2557                 if (strcmp(sc->sc_name, name) == 0)
 2558                         return (sc);
 2559                 if (pp != NULL && strcmp(pp->name, name) == 0)
 2560                         return (sc);
 2561         }
 2562         return (NULL);
 2563 }
 2564 
 2565 static void
 2566 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 2567 {
 2568         struct g_journal_softc *sc;
 2569         const char *name;
 2570         char param[16];
 2571         int *nargs;
 2572         int error, i;
 2573 
 2574         g_topology_assert();
 2575 
 2576         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 2577         if (nargs == NULL) {
 2578                 gctl_error(req, "No '%s' argument.", "nargs");
 2579                 return;
 2580         }
 2581         if (*nargs <= 0) {
 2582                 gctl_error(req, "Missing device(s).");
 2583                 return;
 2584         }
 2585 
 2586         for (i = 0; i < *nargs; i++) {
 2587                 snprintf(param, sizeof(param), "arg%d", i);
 2588                 name = gctl_get_asciiparam(req, param);
 2589                 if (name == NULL) {
 2590                         gctl_error(req, "No 'arg%d' argument.", i);
 2591                         return;
 2592                 }
 2593                 sc = g_journal_find_device(mp, name);
 2594                 if (sc == NULL) {
 2595                         gctl_error(req, "No such device: %s.", name);
 2596                         return;
 2597                 }
 2598                 error = g_journal_destroy(sc);
 2599                 if (error != 0) {
 2600                         gctl_error(req, "Cannot destroy device %s (error=%d).",
 2601                             LIST_FIRST(&sc->sc_geom->provider)->name, error);
 2602                         return;
 2603                 }
 2604         }
 2605 }
 2606 
 2607 static void
 2608 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
 2609 {
 2610 
 2611         g_topology_assert();
 2612         g_topology_unlock();
 2613         g_journal_sync_requested++;
 2614         wakeup(&g_journal_switcher_state);
 2615         while (g_journal_sync_requested > 0)
 2616                 tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
 2617         g_topology_lock();
 2618 }
 2619 
 2620 static void
 2621 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 2622 {
 2623         uint32_t *version;
 2624 
 2625         g_topology_assert();
 2626 
 2627         version = gctl_get_paraml(req, "version", sizeof(*version));
 2628         if (version == NULL) {
 2629                 gctl_error(req, "No '%s' argument.", "version");
 2630                 return;
 2631         }
 2632         if (*version != G_JOURNAL_VERSION) {
 2633                 gctl_error(req, "Userland and kernel parts are out of sync.");
 2634                 return;
 2635         }
 2636 
 2637         if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
 2638                 g_journal_ctl_destroy(req, mp);
 2639                 return;
 2640         } else if (strcmp(verb, "sync") == 0) {
 2641                 g_journal_ctl_sync(req, mp);
 2642                 return;
 2643         }
 2644 
 2645         gctl_error(req, "Unknown verb.");
 2646 }
 2647 
 2648 static void
 2649 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2650     struct g_consumer *cp, struct g_provider *pp)
 2651 {
 2652         struct g_journal_softc *sc;
 2653 
 2654         g_topology_assert();
 2655 
 2656         sc = gp->softc;
 2657         if (sc == NULL)
 2658                 return;
 2659         if (pp != NULL) {
 2660                 /* Nothing here. */
 2661         } else if (cp != NULL) {
 2662                 int first = 1;
 2663 
 2664                 sbuf_printf(sb, "%s<Role>", indent);
 2665                 if (cp == sc->sc_dconsumer) {
 2666                         sbuf_printf(sb, "Data");
 2667                         first = 0;
 2668                 }
 2669                 if (cp == sc->sc_jconsumer) {
 2670                         if (!first)
 2671                                 sbuf_printf(sb, ",");
 2672                         sbuf_printf(sb, "Journal");
 2673                 }
 2674                 sbuf_printf(sb, "</Role>\n");
 2675                 if (cp == sc->sc_jconsumer) {
 2676                         sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
 2677                             (intmax_t)sc->sc_jstart);
 2678                         sbuf_printf(sb, "<Jend>%jd</Jend>\n",
 2679                             (intmax_t)sc->sc_jend);
 2680                 }
 2681         } else {
 2682                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 2683         }
 2684 }
 2685 
/*
 * Tags of the shutdown_post_sync and vm_lowmem event handlers.  They are set
 * by g_journal_init() and used by g_journal_fini() to deregister the
 * handlers; NULL means the handler is not registered.
 */
static eventhandler_tag g_journal_event_shutdown = NULL;
static eventhandler_tag g_journal_event_lowmem = NULL;
 2688 
 2689 static void
 2690 g_journal_shutdown(void *arg, int howto __unused)
 2691 {
 2692         struct g_class *mp;
 2693         struct g_geom *gp, *gp2;
 2694 
 2695         if (panicstr != NULL)
 2696                 return;
 2697         mp = arg;
 2698         DROP_GIANT();
 2699         g_topology_lock();
 2700         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 2701                 if (gp->softc == NULL)
 2702                         continue;
 2703                 GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
 2704                 g_journal_destroy(gp->softc);
 2705         }
 2706         g_topology_unlock();
 2707         PICKUP_GIANT();
 2708 }
 2709 
 2710 /*
 2711  * Free cached requests from inactive queue in case of low memory.
 2712  * We free GJ_FREE_AT_ONCE elements at once.
 2713  */
 2714 #define GJ_FREE_AT_ONCE 4
 2715 static void
 2716 g_journal_lowmem(void *arg, int howto __unused)
 2717 {
 2718         struct g_journal_softc *sc;
 2719         struct g_class *mp;
 2720         struct g_geom *gp;
 2721         struct bio *bp;
 2722         u_int nfree = GJ_FREE_AT_ONCE;
 2723 
 2724         g_journal_stats_low_mem++;
 2725         mp = arg;
 2726         DROP_GIANT();
 2727         g_topology_lock();
 2728         LIST_FOREACH(gp, &mp->geom, geom) {
 2729                 sc = gp->softc;
 2730                 if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
 2731                         continue;
 2732                 mtx_lock(&sc->sc_mtx);
 2733                 for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
 2734                     nfree--, bp = bp->bio_next) {
 2735                         /*
 2736                          * This is safe to free the bio_data, because:
 2737                          * 1. If bio_data is NULL it will be read from the
 2738                          *    inactive journal.
 2739                          * 2. If bp is sent down, it is first removed from the
 2740                          *    inactive queue, so it's impossible to free the
 2741                          *    data from under in-flight bio.
 2742                          * On the other hand, freeing elements from the active
 2743                          * queue, is not safe.
 2744                          */
 2745                         if (bp->bio_data != NULL) {
 2746                                 GJ_DEBUG(2, "Freeing data from %s.",
 2747                                     sc->sc_name);
 2748                                 gj_free(bp->bio_data, bp->bio_length);
 2749                                 bp->bio_data = NULL;
 2750                         }
 2751                 }
 2752                 mtx_unlock(&sc->sc_mtx);
 2753                 if (nfree == 0)
 2754                         break;
 2755         }
 2756         g_topology_unlock();
 2757         PICKUP_GIANT();
 2758 }
 2759 
/* Forward declaration: g_journal_init() starts this thread; it is defined below. */
static void g_journal_switcher(void *arg);
 2761 
 2762 static void
 2763 g_journal_init(struct g_class *mp)
 2764 {
 2765         int error;
 2766 
 2767         /* Pick a conservative value if provided value sucks. */
 2768         if (g_journal_cache_divisor <= 0 ||
 2769             (vm_kmem_size / g_journal_cache_divisor == 0)) {
 2770                 g_journal_cache_divisor = 5;
 2771         }
 2772         if (g_journal_cache_limit > 0) {
 2773                 g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
 2774                 g_journal_cache_low =
 2775                     (g_journal_cache_limit / 100) * g_journal_cache_switch;
 2776         }
 2777         g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
 2778             g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
 2779         if (g_journal_event_shutdown == NULL)
 2780                 GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
 2781         g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 2782             g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
 2783         if (g_journal_event_lowmem == NULL)
 2784                 GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
 2785         error = kthread_create(g_journal_switcher, mp, NULL, 0, 0,
 2786             "g_journal switcher");
 2787         KASSERT(error == 0, ("Cannot create switcher thread."));
 2788 }
 2789 
 2790 static void
 2791 g_journal_fini(struct g_class *mp)
 2792 {
 2793 
 2794         if (g_journal_event_shutdown != NULL) {
 2795                 EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 2796                     g_journal_event_shutdown);
 2797         }
 2798         if (g_journal_event_lowmem != NULL)
 2799                 EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
 2800         g_journal_switcher_state = GJ_SWITCHER_DIE;
 2801         wakeup(&g_journal_switcher_state);
 2802         while (g_journal_switcher_state != GJ_SWITCHER_DIED)
 2803                 tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
 2804         GJ_DEBUG(1, "Switcher died.");
 2805 }
 2806 
/* Declare g_journal_class as a GEOM class (presumably also sets up the "g_journal" module -- confirm against DECLARE_GEOM_CLASS). */
DECLARE_GEOM_CLASS(g_journal_class, g_journal);
 2808 
 2809 static const struct g_journal_desc *
 2810 g_journal_find_desc(const char *fstype)
 2811 {
 2812         const struct g_journal_desc *desc;
 2813         int i;
 2814 
 2815         for (desc = g_journal_filesystems[i = 0]; desc != NULL;
 2816              desc = g_journal_filesystems[++i]) {
 2817                 if (strcmp(desc->jd_fstype, fstype) == 0)
 2818                         break;
 2819         }
 2820         return (desc);
 2821 }
 2822 
 2823 static void
 2824 g_journal_switch_wait(struct g_journal_softc *sc)
 2825 {
 2826         struct bintime bt;
 2827 
 2828         mtx_assert(&sc->sc_mtx, MA_OWNED);
 2829         if (g_journal_debug >= 2) {
 2830                 if (sc->sc_flush_in_progress > 0) {
 2831                         GJ_DEBUG(2, "%d requests flushing.",
 2832                             sc->sc_flush_in_progress);
 2833                 }
 2834                 if (sc->sc_copy_in_progress > 0) {
 2835                         GJ_DEBUG(2, "%d requests copying.",
 2836                             sc->sc_copy_in_progress);
 2837                 }
 2838                 if (sc->sc_flush_count > 0) {
 2839                         GJ_DEBUG(2, "%d requests to flush.",
 2840                             sc->sc_flush_count);
 2841                 }
 2842                 if (sc->sc_delayed_count > 0) {
 2843                         GJ_DEBUG(2, "%d requests delayed.",
 2844                             sc->sc_delayed_count);
 2845                 }
 2846         }
 2847         g_journal_stats_switches++;
 2848         if (sc->sc_copy_in_progress > 0)
 2849                 g_journal_stats_wait_for_copy++;
 2850         GJ_TIMER_START(1, &bt);
 2851         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 2852         sc->sc_flags |= GJF_DEVICE_SWITCH;
 2853         wakeup(sc);
 2854         while (sc->sc_flags & GJF_DEVICE_SWITCH) {
 2855                 msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
 2856                     "gj:switch", 0);
 2857         }
 2858         GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
 2859 }
 2860 
 2861 static void
 2862 g_journal_do_switch(struct g_class *classp, struct thread *td)
 2863 {
 2864         struct g_journal_softc *sc;
 2865         const struct g_journal_desc *desc;
 2866         struct g_geom *gp;
 2867         struct mount *mp;
 2868         struct bintime bt;
 2869         char *mountpoint;
 2870         int error, vfslocked;
 2871 
 2872         DROP_GIANT();
 2873         g_topology_lock();
 2874         LIST_FOREACH(gp, &classp->geom, geom) {
 2875                 sc = gp->softc;
 2876                 if (sc == NULL)
 2877                         continue;
 2878                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
 2879                         continue;
 2880                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 2881                         continue;
 2882                 mtx_lock(&sc->sc_mtx);
 2883                 sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
 2884                 mtx_unlock(&sc->sc_mtx);
 2885         }
 2886         g_topology_unlock();
 2887         PICKUP_GIANT();
 2888 
 2889         mtx_lock(&mountlist_mtx);
 2890         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 2891                 if (mp->mnt_gjprovider == NULL)
 2892                         continue;
 2893                 if (mp->mnt_flag & MNT_RDONLY)
 2894                         continue;
 2895                 desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 2896                 if (desc == NULL)
 2897                         continue;
 2898                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 2899                         continue;
 2900                 /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
 2901 
 2902                 DROP_GIANT();
 2903                 g_topology_lock();
 2904                 sc = g_journal_find_device(classp, mp->mnt_gjprovider);
 2905                 g_topology_unlock();
 2906                 PICKUP_GIANT();
 2907 
 2908                 if (sc == NULL) {
 2909                         GJ_DEBUG(0, "Cannot find journal geom for %s.",
 2910                             mp->mnt_gjprovider);
 2911                         goto next;
 2912                 } else if (JEMPTY(sc)) {
 2913                         mtx_lock(&sc->sc_mtx);
 2914                         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 2915                         mtx_unlock(&sc->sc_mtx);
 2916                         GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 2917                         goto next;
 2918                 }
 2919 
 2920                 mountpoint = mp->mnt_stat.f_mntonname;
 2921 
 2922                 vfslocked = VFS_LOCK_GIANT(mp);
 2923 
 2924                 error = vn_start_write(NULL, &mp, V_WAIT);
 2925                 if (error != 0) {
 2926                         VFS_UNLOCK_GIANT(vfslocked);
 2927                         GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
 2928                             mountpoint, error);
 2929                         goto next;
 2930                 }
 2931 
 2932                 MNT_ILOCK(mp);
 2933                 mp->mnt_noasync++;
 2934                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
 2935                 MNT_IUNLOCK(mp);
 2936 
 2937                 GJ_TIMER_START(1, &bt);
 2938                 vfs_msync(mp, MNT_NOWAIT);
 2939                 GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 2940 
 2941                 GJ_TIMER_START(1, &bt);
 2942                 error = VFS_SYNC(mp, MNT_NOWAIT, curthread);
 2943                 if (error == 0)
 2944                         GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
 2945                 else {
 2946                         GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
 2947                             mountpoint, error);
 2948                 }
 2949 
 2950                 MNT_ILOCK(mp);
 2951                 mp->mnt_noasync--;
 2952                 if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 2953                         mp->mnt_kern_flag |= MNTK_ASYNC;
 2954                 MNT_IUNLOCK(mp);
 2955 
 2956                 vn_finished_write(mp);
 2957 
 2958                 if (error != 0) {
 2959                         VFS_UNLOCK_GIANT(vfslocked);
 2960                         goto next;
 2961                 }
 2962 
 2963                 /*
 2964                  * Send BIO_FLUSH before freezing the file system, so it can be
 2965                  * faster after the freeze.
 2966                  */
 2967                 GJ_TIMER_START(1, &bt);
 2968                 g_journal_flush_cache(sc);
 2969                 GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
 2970 
 2971                 GJ_TIMER_START(1, &bt);
 2972                 error = vfs_write_suspend(mp);
 2973                 VFS_UNLOCK_GIANT(vfslocked);
 2974                 GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
 2975                 if (error != 0) {
 2976                         GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
 2977                             mountpoint, error);
 2978                         goto next;
 2979                 }
 2980 
 2981                 error = desc->jd_clean(mp);
 2982                 if (error != 0)
 2983                         goto next;
 2984 
 2985                 mtx_lock(&sc->sc_mtx);
 2986                 g_journal_switch_wait(sc);
 2987                 mtx_unlock(&sc->sc_mtx);
 2988 
 2989                 vfs_write_resume(mp);
 2990 next:
 2991                 mtx_lock(&mountlist_mtx);
 2992                 vfs_unbusy(mp, td);
 2993         }
 2994         mtx_unlock(&mountlist_mtx);
 2995 
 2996         sc = NULL;
 2997         for (;;) {
 2998                 DROP_GIANT();
 2999                 g_topology_lock();
 3000                 LIST_FOREACH(gp, &g_journal_class.geom, geom) {
 3001                         sc = gp->softc;
 3002                         if (sc == NULL)
 3003                                 continue;
 3004                         mtx_lock(&sc->sc_mtx);
 3005                         if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
 3006                             !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
 3007                             (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
 3008                                 break;
 3009                         }
 3010                         mtx_unlock(&sc->sc_mtx);
 3011                         sc = NULL;
 3012                 }
 3013                 g_topology_unlock();
 3014                 PICKUP_GIANT();
 3015                 if (sc == NULL)
 3016                         break;
 3017                 mtx_assert(&sc->sc_mtx, MA_OWNED);
 3018                 g_journal_switch_wait(sc);
 3019                 mtx_unlock(&sc->sc_mtx);
 3020         }
 3021 }
 3022 
 3023 /*
 3024  * TODO: Switcher thread should be started on first geom creation and killed on
 3025  * last geom destruction.
 3026  */
 3027 static void
 3028 g_journal_switcher(void *arg)
 3029 {
 3030         struct thread *td = curthread;
 3031         struct g_class *mp;
 3032         struct bintime bt;
 3033         int error;
 3034 
 3035         mp = arg;
 3036         for (;;) {
 3037                 g_journal_switcher_wokenup = 0;
 3038                 error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
 3039                     g_journal_switch_time * hz);
 3040                 if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
 3041                         g_journal_switcher_state = GJ_SWITCHER_DIED;
 3042                         GJ_DEBUG(1, "Switcher exiting.");
 3043                         wakeup(&g_journal_switcher_state);
 3044                         kthread_exit(0);
 3045                 }
 3046                 if (error == 0 && g_journal_sync_requested == 0) {
 3047                         GJ_DEBUG(1, "Out of cache, force switch (used=%u "
 3048                             "limit=%u).", g_journal_cache_used,
 3049                             g_journal_cache_limit);
 3050                 }
 3051                 GJ_TIMER_START(1, &bt);
 3052                 g_journal_do_switch(mp, td);
 3053                 GJ_TIMER_STOP(1, &bt, "Entire switch time");
 3054                 if (g_journal_sync_requested > 0) {
 3055                         g_journal_sync_requested = 0;
 3056                         wakeup(&g_journal_sync_requested);
 3057                 }
 3058         }
 3059 }

Cache object: 078d8f0ea9ac2a53188a650339f3a90d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.