The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/geom/journal/g_journal.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD$");
   29 
   30 #include <sys/param.h>
   31 #include <sys/systm.h>
   32 #include <sys/kernel.h>
   33 #include <sys/module.h>
   34 #include <sys/limits.h>
   35 #include <sys/lock.h>
   36 #include <sys/mutex.h>
   37 #include <sys/bio.h>
   38 #include <sys/sysctl.h>
   39 #include <sys/malloc.h>
   40 #include <sys/mount.h>
   41 #include <sys/eventhandler.h>
   42 #include <sys/proc.h>
   43 #include <sys/kthread.h>
   44 #include <sys/sched.h>
   45 #include <sys/taskqueue.h>
   46 #include <sys/vnode.h>
   47 #include <sys/sbuf.h>
   48 #ifdef GJ_MEMDEBUG
   49 #include <sys/stack.h>
   50 #include <sys/kdb.h>
   51 #endif
   52 #include <vm/vm.h>
   53 #include <vm/vm_kern.h>
   54 #include <geom/geom.h>
   55 
   56 #include <geom/journal/g_journal.h>
   57 
   58 
   59 /*
   60  * On-disk journal format:
   61  *
   62  * JH - Journal header
   63  * RH - Record header
   64  *
   65  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
   66  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
   67  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
   68  *
   69  */
   70 
/* Compile-time checks: both on-disk header structures must fit in 512 bytes. */
CTASSERT(sizeof(struct g_journal_header) <= 512);
CTASSERT(sizeof(struct g_journal_record_header) <= 512);

/* Malloc type used by the gj_malloc()/gj_realloc()/gj_free() wrappers. */
static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
/*
 * Held while g_journal_cache_used is updated and while the switcher is
 * being woken up.
 */
static struct mtx g_journal_cache_mtx;
MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);

/* NULL-terminated table of file system descriptors. */
const struct g_journal_desc *g_journal_filesystems[] = {
        &g_journal_ufs,
        NULL
};
   82 
SYSCTL_DECL(_kern_geom);

/* Debug level; also registered as the kern.geom.journal.debug tunable. */
int g_journal_debug = 0;
TUNABLE_INT("kern.geom.journal.debug", &g_journal_debug);
/*
 * Knobs exported below under kern.geom.journal.  The description string of
 * each sysctl entry states what the knob controls.
 */
static u_int g_journal_switch_time = 10;
static u_int g_journal_force_switch = 70;
static u_int g_journal_parallel_flushes = 16;
static u_int g_journal_parallel_copies = 16;
static u_int g_journal_accept_immediately = 64;
static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
static u_int g_journal_do_optimize = 1;

SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff");
SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0,
    "Debug level");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
    &g_journal_switch_time, 0, "Switch journals every N seconds");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
    &g_journal_force_switch, 0, "Force switch when journal is N% full");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
    &g_journal_parallel_flushes, 0,
    "Number of flush I/O requests to send in parallel");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
    &g_journal_accept_immediately, 0,
    "Number of I/O requests accepted immediately");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
    &g_journal_parallel_copies, 0,
    "Number of copy I/O requests to send in parallel");
  111 static int
  112 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
  113 {
  114         u_int entries;
  115         int error;
  116 
  117         entries = g_journal_record_entries;
  118         error = sysctl_handle_int(oidp, &entries, 0, req);
  119         if (error != 0 || req->newptr == NULL)
  120                 return (error);
  121         if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
  122                 return (EINVAL);
  123         g_journal_record_entries = entries;
  124         return (0);
  125 }
  126 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
  127     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
  128     "Maximum number of entires in one journal record");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
    &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");

/* Bytes currently accounted by gj_malloc()/gj_realloc()/gj_free(). */
static u_int g_journal_cache_used = 0;
/* Cache size limit in bytes; gj_malloc() skips its limit checks when 0. */
static u_int g_journal_cache_limit = 64 * 1024 * 1024;
TUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit);
static u_int g_journal_cache_divisor = 2;
TUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor);
/* Percentage of the limit from which g_journal_cache_low is derived. */
static u_int g_journal_cache_switch = 90;
static u_int g_journal_cache_misses = 0;
/* Counts gj_malloc() calls refused because the limit would be exceeded. */
static u_int g_journal_cache_alloc_failures = 0;
/* Usage threshold above which gj_malloc() wakes up the switcher. */
static u_int g_journal_cache_low = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
    "GEOM_JOURNAL cache");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
    &g_journal_cache_used, 0, "Number of allocated bytes");
  146 static int
  147 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
  148 {
  149         u_int limit;
  150         int error;
  151 
  152         limit = g_journal_cache_limit;
  153         error = sysctl_handle_int(oidp, &limit, 0, req);
  154         if (error != 0 || req->newptr == NULL)
  155                 return (error);
  156         g_journal_cache_limit = limit;
  157         g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
  158         return (0);
  159 }
/* kern.geom.journal.cache.limit is routed through the handler above. */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I",
    "Maximum number of allocated bytes");
/* The divisor is exported with CTLFLAG_RDTUN. */
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
    &g_journal_cache_divisor, 0,
    "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
  166 static int
  167 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
  168 {
  169         u_int cswitch;
  170         int error;
  171 
  172         cswitch = g_journal_cache_switch;
  173         error = sysctl_handle_int(oidp, &cswitch, 0, req);
  174         if (error != 0 || req->newptr == NULL)
  175                 return (error);
  176         if (cswitch < 0 || cswitch > 100)
  177                 return (EINVAL);
  178         g_journal_cache_switch = cswitch;
  179         g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
  180         return (0);
  181 }
/* kern.geom.journal.cache.switch is routed through the handler above. */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
    "Force switch when we hit this percent of cache use");
/* Counters; both are exported read-write. */
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
    &g_journal_cache_misses, 0, "Number of cache misses");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
    &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
  189 
/*
 * Statistics counters, exported read-write under kern.geom.journal.stats.
 * The description string of each sysctl entry states what is counted.
 */
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
    "GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
    &g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
    &g_journal_stats_journal_full, 0,
    "Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
  212 
/* Forward declarations of the class methods referenced by g_journal_class. */
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;

/* GEOM class descriptor for the journal class. */
struct g_class g_journal_class = {
        .name = G_JOURNAL_CLASS_NAME,
        .version = G_VERSION,
        .taste = g_journal_taste,
        .ctlreq = g_journal_config,
        .dumpconf = g_journal_dumpconf,
        .init = g_journal_init,
        .fini = g_journal_fini
};
  228 
/* Forward declarations for functions used before they are defined. */
static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_switch_wait(struct g_journal_softc *sc);

/* Values of g_journal_switcher_state. */
#define GJ_SWITCHER_WORKING     0
#define GJ_SWITCHER_DIE         1
#define GJ_SWITCHER_DIED        2
/* The address of this variable is also used as the switcher wakeup channel. */
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
/* Set to 1 right before the switcher is woken up, to avoid repeated wakeups. */
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;
  239 
  240 #ifdef GJ_MEMDEBUG
/*
 * Debug header that gj_malloc() places in front of every allocation when
 * GJ_MEMDEBUG is defined; gj_free() checks it against the size it is given.
 */
struct meminfo {
        size_t          mi_size;        /* size requested from gj_malloc() */
        struct stack    mi_stack;       /* stack saved at allocation time */
};
  245 #endif
  246 
  247 /*
  248  * We use our own malloc/realloc/free funtions, so we can collect statistics
  249  * and force journal switch when we're running out of cache.
  250  */
  251 static void *
  252 gj_malloc(size_t size, int flags)
  253 {
  254         void *p;
  255 #ifdef GJ_MEMDEBUG
  256         struct meminfo *mi;
  257 #endif
  258 
  259         mtx_lock(&g_journal_cache_mtx);
  260         if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
  261             g_journal_cache_used + size > g_journal_cache_low) {
  262                 GJ_DEBUG(1, "No cache, waking up the switcher.");
  263                 g_journal_switcher_wokenup = 1;
  264                 wakeup(&g_journal_switcher_state);
  265         }
  266         if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
  267             g_journal_cache_used + size > g_journal_cache_limit) {
  268                 mtx_unlock(&g_journal_cache_mtx);
  269                 g_journal_cache_alloc_failures++;
  270                 return (NULL);
  271         }
  272         g_journal_cache_used += size;
  273         mtx_unlock(&g_journal_cache_mtx);
  274         flags &= ~M_NOWAIT;
  275 #ifndef GJ_MEMDEBUG
  276         p = malloc(size, M_JOURNAL, flags | M_WAITOK);
  277 #else
  278         mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
  279         p = (u_char *)mi + sizeof(*mi);
  280         mi->mi_size = size;
  281         stack_save(&mi->mi_stack);
  282 #endif
  283         return (p);
  284 }
  285 
  286 static void
  287 gj_free(void *p, size_t size)
  288 {
  289 #ifdef GJ_MEMDEBUG
  290         struct meminfo *mi;
  291 #endif
  292 
  293         KASSERT(p != NULL, ("p=NULL"));
  294         KASSERT(size > 0, ("size=0"));
  295         mtx_lock(&g_journal_cache_mtx);
  296         KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
  297         g_journal_cache_used -= size;
  298         mtx_unlock(&g_journal_cache_mtx);
  299 #ifdef GJ_MEMDEBUG
  300         mi = p = (void *)((u_char *)p - sizeof(*mi));
  301         if (mi->mi_size != size) {
  302                 printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
  303                     mi->mi_size);
  304                 printf("GJOURNAL: Alloc backtrace:\n");
  305                 stack_print(&mi->mi_stack);
  306                 printf("GJOURNAL: Free backtrace:\n");
  307                 kdb_backtrace();
  308         }
  309 #endif
  310         free(p, M_JOURNAL);
  311 }
  312 
  313 static void *
  314 gj_realloc(void *p, size_t size, size_t oldsize)
  315 {
  316         void *np;
  317 
  318 #ifndef GJ_MEMDEBUG
  319         mtx_lock(&g_journal_cache_mtx);
  320         g_journal_cache_used -= oldsize;
  321         g_journal_cache_used += size;
  322         mtx_unlock(&g_journal_cache_mtx);
  323         np = realloc(p, size, M_JOURNAL, M_WAITOK);
  324 #else
  325         np = gj_malloc(size, M_WAITOK);
  326         bcopy(p, np, MIN(oldsize, size));
  327         gj_free(p, oldsize);
  328 #endif
  329         return (np);
  330 }
  331 
  332 static void
  333 g_journal_check_overflow(struct g_journal_softc *sc)
  334 {
  335         off_t length, used;
  336 
  337         if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
  338              sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
  339             (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
  340              sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
  341              sc->sc_journal_offset < sc->sc_active.jj_offset)) {
  342                 panic("Journal overflow (joffset=%jd active=%jd inactive=%jd)",
  343                     (intmax_t)sc->sc_journal_offset,
  344                     (intmax_t)sc->sc_active.jj_offset,
  345                     (intmax_t)sc->sc_inactive.jj_offset);
  346         }
  347         if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
  348                 length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
  349                 used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  350         } else {
  351                 length = sc->sc_jend - sc->sc_active.jj_offset;
  352                 length += sc->sc_inactive.jj_offset - sc->sc_jstart;
  353                 if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
  354                         used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  355                 else {
  356                         used = sc->sc_jend - sc->sc_active.jj_offset;
  357                         used += sc->sc_journal_offset - sc->sc_jstart;
  358                 }
  359         }
  360         /* Already woken up? */
  361         if (g_journal_switcher_wokenup)
  362                 return;
  363         /*
  364          * If the active journal takes more than g_journal_force_switch precent
  365          * of free journal space, we force journal switch.
  366          */
  367         KASSERT(length > 0,
  368             ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
  369             (intmax_t)length, (intmax_t)used,
  370             (intmax_t)sc->sc_active.jj_offset,
  371             (intmax_t)sc->sc_inactive.jj_offset,
  372             (intmax_t)sc->sc_journal_offset));
  373         if ((used * 100) / length > g_journal_force_switch) {
  374                 g_journal_stats_journal_full++;
  375                 GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
  376                     sc->sc_name, (used * 100) / length);
  377                 mtx_lock(&g_journal_cache_mtx);
  378                 g_journal_switcher_wokenup = 1;
  379                 wakeup(&g_journal_switcher_state);
  380                 mtx_unlock(&g_journal_cache_mtx);
  381         }
  382 }
  383 
  384 static void
  385 g_journal_orphan(struct g_consumer *cp)
  386 {
  387         struct g_journal_softc *sc;
  388         char name[256];
  389         int error;
  390 
  391         g_topology_assert();
  392         sc = cp->geom->softc;
  393         strlcpy(name, cp->provider->name, sizeof(name));
  394         GJ_DEBUG(0, "Lost provider %s.", name);
  395         if (sc == NULL)
  396                 return;
  397         error = g_journal_destroy(sc);
  398         if (error == 0)
  399                 GJ_DEBUG(0, "Journal %s destroyed.", name);
  400         else {
  401                 GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
  402                     "Destroy it manually after last close.", sc->sc_name,
  403                     error);
  404         }
  405 }
  406 
  407 static int
  408 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
  409 {
  410         struct g_journal_softc *sc;
  411         int dcr, dcw, dce;
  412 
  413         g_topology_assert();
  414         GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
  415             acr, acw, ace);
  416 
  417         dcr = pp->acr + acr;
  418         dcw = pp->acw + acw;
  419         dce = pp->ace + ace;
  420 
  421         sc = pp->geom->softc;
  422         if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
  423                 if (acr <= 0 && acw <= 0 && ace <= 0)
  424                         return (0);
  425                 else
  426                         return (ENXIO);
  427         }
  428         if (pp->acw == 0 && dcw > 0) {
  429                 GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
  430                 sc->sc_flags &= ~GJF_DEVICE_CLEAN;
  431                 g_topology_unlock();
  432                 g_journal_metadata_update(sc);
  433                 g_topology_lock();
  434         } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
  435                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
  436                 sc->sc_flags |= GJF_DEVICE_CLEAN;
  437                 g_topology_unlock();
  438                 g_journal_metadata_update(sc);
  439                 g_topology_lock();
  440         } */
  441         return (0);
  442 }
  443 
  444 static void
  445 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
  446 {
  447 
  448         bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
  449         data += sizeof(GJ_HEADER_MAGIC);
  450         le32enc(data, hdr->jh_journal_id);
  451         data += 4;
  452         le32enc(data, hdr->jh_journal_next_id);
  453 }
  454 
  455 static int
  456 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
  457 {
  458 
  459         bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
  460         data += sizeof(hdr->jh_magic);
  461         if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
  462                 return (EINVAL);
  463         hdr->jh_journal_id = le32dec(data);
  464         data += 4;
  465         hdr->jh_journal_next_id = le32dec(data);
  466         return (0);
  467 }
  468 
  469 static void
  470 g_journal_flush_cache(struct g_journal_softc *sc)
  471 {
  472         struct bintime bt;
  473         int error;
  474 
  475         if (sc->sc_bio_flush == 0)
  476                 return;
  477         GJ_TIMER_START(1, &bt);
  478         if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
  479                 error = g_io_flush(sc->sc_jconsumer);
  480                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  481                     sc->sc_jconsumer->provider->name, error);
  482         }
  483         if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
  484                 /*
  485                  * TODO: This could be called in parallel with the
  486                  *       previous call.
  487                  */
  488                 error = g_io_flush(sc->sc_dconsumer);
  489                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  490                     sc->sc_dconsumer->provider->name, error);
  491         }
  492         GJ_TIMER_STOP(1, &bt, "Cache flush time");
  493 }
  494 
  495 static int
  496 g_journal_write_header(struct g_journal_softc *sc)
  497 {
  498         struct g_journal_header hdr;
  499         struct g_consumer *cp;
  500         u_char *buf;
  501         int error;
  502 
  503         cp = sc->sc_jconsumer;
  504         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  505 
  506         strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
  507         hdr.jh_journal_id = sc->sc_journal_id;
  508         hdr.jh_journal_next_id = sc->sc_journal_next_id;
  509         g_journal_header_encode(&hdr, buf);
  510         error = g_write_data(cp, sc->sc_journal_offset, buf,
  511             cp->provider->sectorsize);
  512         /* if (error == 0) */
  513         sc->sc_journal_offset += cp->provider->sectorsize;
  514 
  515         gj_free(buf, cp->provider->sectorsize);
  516         return (error);
  517 }
  518 
  519 /*
  520  * Every journal record has a header and data following it.
  521  * Functions below are used to decode the header before storing it to
  522  * little endian and to encode it after reading to system endianess.
  523  */
  524 static void
  525 g_journal_record_header_encode(struct g_journal_record_header *hdr,
  526     u_char *data)
  527 {
  528         struct g_journal_entry *ent;
  529         u_int i;
  530 
  531         bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
  532         data += sizeof(GJ_RECORD_HEADER_MAGIC);
  533         le32enc(data, hdr->jrh_journal_id);
  534         data += 8;
  535         le16enc(data, hdr->jrh_nentries);
  536         data += 2;
  537         bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
  538         data += 8;
  539         for (i = 0; i < hdr->jrh_nentries; i++) {
  540                 ent = &hdr->jrh_entries[i];
  541                 le64enc(data, ent->je_joffset);
  542                 data += 8;
  543                 le64enc(data, ent->je_offset);
  544                 data += 8;
  545                 le64enc(data, ent->je_length);
  546                 data += 8;
  547         }
  548 }
  549 
  550 static int
  551 g_journal_record_header_decode(const u_char *data,
  552     struct g_journal_record_header *hdr)
  553 {
  554         struct g_journal_entry *ent;
  555         u_int i;
  556 
  557         bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
  558         data += sizeof(hdr->jrh_magic);
  559         if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
  560                 return (EINVAL);
  561         hdr->jrh_journal_id = le32dec(data);
  562         data += 8;
  563         hdr->jrh_nentries = le16dec(data);
  564         data += 2;
  565         if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
  566                 return (EINVAL);
  567         bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
  568         data += 8;
  569         for (i = 0; i < hdr->jrh_nentries; i++) {
  570                 ent = &hdr->jrh_entries[i];
  571                 ent->je_joffset = le64dec(data);
  572                 data += 8;
  573                 ent->je_offset = le64dec(data);
  574                 data += 8;
  575                 ent->je_length = le64dec(data);
  576                 data += 8;
  577         }
  578         return (0);
  579 }
  580 
  581 /*
  582  * Function reads metadata from a provider (via the given consumer), decodes
  583  * it to system endianess and verifies its correctness.
  584  */
  585 static int
  586 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
  587 {
  588         struct g_provider *pp;
  589         u_char *buf;
  590         int error;
  591 
  592         g_topology_assert();
  593 
  594         error = g_access(cp, 1, 0, 0);
  595         if (error != 0)
  596                 return (error);
  597         pp = cp->provider;
  598         g_topology_unlock();
  599         /* Metadata is stored in last sector. */
  600         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
  601             &error);
  602         g_topology_lock();
  603         g_access(cp, -1, 0, 0);
  604         if (buf == NULL) {
  605                 GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
  606                     cp->provider->name, error);
  607                 return (error);
  608         }
  609 
  610         /* Decode metadata. */
  611         error = journal_metadata_decode(buf, md);
  612         g_free(buf);
  613         /* Is this is gjournal provider at all? */
  614         if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
  615                 return (EINVAL);
  616         /*
  617          * Are we able to handle this version of metadata?
  618          * We only maintain backward compatibility.
  619          */
  620         if (md->md_version > G_JOURNAL_VERSION) {
  621                 GJ_DEBUG(0,
  622                     "Kernel module is too old to handle metadata from %s.",
  623                     cp->provider->name);
  624                 return (EINVAL);
  625         }
  626         /* Is checksum correct? */
  627         if (error != 0) {
  628                 GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
  629                     cp->provider->name);
  630                 return (error);
  631         }
  632         return (0);
  633 }
  634 
  635 /*
  636  * Two functions below are responsible for updating metadata.
  637  * Only metadata on the data provider is updated (we need to update
  638  * information about active journal in there).
  639  */
  640 static void
  641 g_journal_metadata_done(struct bio *bp)
  642 {
  643 
  644         /*
  645          * There is not much we can do on error except informing about it.
  646          */
  647         if (bp->bio_error != 0) {
  648                 GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
  649                     bp->bio_error);
  650         } else {
  651                 GJ_LOGREQ(2, bp, "Metadata updated.");
  652         }
  653         gj_free(bp->bio_data, bp->bio_length);
  654         g_destroy_bio(bp);
  655 }
  656 
  657 static void
  658 g_journal_metadata_update(struct g_journal_softc *sc)
  659 {
  660         struct g_journal_metadata md;
  661         struct g_consumer *cp;
  662         struct bio *bp;
  663         u_char *sector;
  664 
  665         cp = sc->sc_dconsumer;
  666         sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  667         strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
  668         md.md_version = G_JOURNAL_VERSION;
  669         md.md_id = sc->sc_id;
  670         md.md_type = sc->sc_orig_type;
  671         md.md_jstart = sc->sc_jstart;
  672         md.md_jend = sc->sc_jend;
  673         md.md_joffset = sc->sc_inactive.jj_offset;
  674         md.md_jid = sc->sc_journal_previous_id;
  675         md.md_flags = 0;
  676         if (sc->sc_flags & GJF_DEVICE_CLEAN)
  677                 md.md_flags |= GJ_FLAG_CLEAN;
  678 
  679         if (sc->sc_flags & GJF_DEVICE_HARDCODED)
  680                 strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
  681         else
  682                 bzero(md.md_provider, sizeof(md.md_provider));
  683         md.md_provsize = cp->provider->mediasize;
  684         journal_metadata_encode(&md, sector);
  685 
  686         /*
  687          * Flush the cache, so we know all data are on disk.
  688          * We write here informations like "journal is consistent", so we need
  689          * to be sure it is. Without BIO_FLUSH here, we can end up in situation
  690          * where metadata is stored on disk, but not all data.
  691          */
  692         g_journal_flush_cache(sc);
  693 
  694         bp = g_alloc_bio();
  695         bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
  696         bp->bio_length = cp->provider->sectorsize;
  697         bp->bio_data = sector;
  698         bp->bio_cmd = BIO_WRITE;
  699         if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
  700                 bp->bio_done = g_journal_metadata_done;
  701                 g_io_request(bp, cp);
  702         } else {
  703                 bp->bio_done = NULL;
  704                 g_io_request(bp, cp);
  705                 biowait(bp, "gjmdu");
  706                 g_journal_metadata_done(bp);
  707         }
  708 
  709         /*
  710          * Be sure metadata reached the disk.
  711          */
  712         g_journal_flush_cache(sc);
  713 }
  714 
  715 /*
  716  * This is where the I/O request comes from the GEOM.
  717  */
  718 static void
  719 g_journal_start(struct bio *bp)
  720 {
  721         struct g_journal_softc *sc;
  722 
  723         sc = bp->bio_to->geom->softc;
  724         GJ_LOGREQ(3, bp, "Request received.");
  725 
  726         switch (bp->bio_cmd) {
  727         case BIO_READ:
  728         case BIO_WRITE:
  729                 mtx_lock(&sc->sc_mtx);
  730                 bioq_insert_tail(&sc->sc_regular_queue, bp);
  731                 wakeup(sc);
  732                 mtx_unlock(&sc->sc_mtx);
  733                 return;
  734         case BIO_GETATTR:
  735                 if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
  736                         strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
  737                         bp->bio_completed = strlen(bp->bio_to->name) + 1;
  738                         g_io_deliver(bp, 0);
  739                         return;
  740                 }
  741                 /* FALLTHROUGH */
  742         case BIO_DELETE:
  743         default:
  744                 g_io_deliver(bp, EOPNOTSUPP);
  745                 return;
  746         }
  747 }
  748 
  749 static void
  750 g_journal_std_done(struct bio *bp)
  751 {
  752         struct g_journal_softc *sc;
  753 
  754         sc = bp->bio_from->geom->softc;
  755         mtx_lock(&sc->sc_mtx);
  756         bioq_insert_tail(&sc->sc_back_queue, bp);
  757         wakeup(sc);
  758         mtx_unlock(&sc->sc_mtx);
  759 }
  760 
  761 static struct bio *
  762 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
  763     int flags)
  764 {
  765         struct bio *bp;
  766 
  767         bp = g_alloc_bio();
  768         bp->bio_offset = start;
  769         bp->bio_joffset = joffset;
  770         bp->bio_length = end - start;
  771         bp->bio_cmd = BIO_WRITE;
  772         bp->bio_done = g_journal_std_done;
  773         if (data == NULL)
  774                 bp->bio_data = NULL;
  775         else {
  776                 bp->bio_data = gj_malloc(bp->bio_length, flags);
  777                 if (bp->bio_data != NULL)
  778                         bcopy(data, bp->bio_data, bp->bio_length);
  779         }
  780         return (bp);
  781 }
  782 
  783 #define g_journal_insert_bio(head, bp, flags)                           \
  784         g_journal_insert((head), (bp)->bio_offset,                      \
  785                 (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
  786                 (bp)->bio_data, flags)
  787 /*
  788  * The function below does a lot more than just inserting bio to the queue.
  789  * It keeps the queue sorted by offset and ensures that there are no doubled
  790  * data (it combines bios where ranges overlap).
  791  *
  792  * The function returns the number of bios inserted (as bio can be splitted).
  793  */
  794 static int
  795 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
  796     u_char *data, int flags)
  797 {
  798         struct bio *nbp, *cbp, *pbp;
  799         off_t cstart, cend;
  800         u_char *tmpdata;
  801         int n;
  802 
  803         GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
  804             joffset);
  805         n = 0;
  806         pbp = NULL;
  807         GJQ_FOREACH(*head, cbp) {
  808                 cstart = cbp->bio_offset;
  809                 cend = cbp->bio_offset + cbp->bio_length;
  810 
  811                 if (nstart >= cend) {
  812                         /*
  813                          *  +-------------+
  814                          *  |             |
  815                          *  |   current   |  +-------------+
  816                          *  |     bio     |  |             |
  817                          *  |             |  |     new     |
  818                          *  +-------------+  |     bio     |
  819                          *                   |             |
  820                          *                   +-------------+
  821                          */
  822                         GJ_DEBUG(3, "INSERT(%p): 1", *head);
  823                 } else if (nend <= cstart) {
  824                         /*
  825                          *                   +-------------+
  826                          *                   |             |
  827                          *  +-------------+  |   current   |
  828                          *  |             |  |     bio     |
  829                          *  |     new     |  |             |
  830                          *  |     bio     |  +-------------+
  831                          *  |             |
  832                          *  +-------------+
  833                          */
  834                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  835                             flags);
  836                         if (pbp == NULL)
  837                                 *head = nbp;
  838                         else
  839                                 pbp->bio_next = nbp;
  840                         nbp->bio_next = cbp;
  841                         n++;
  842                         GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
  843                             pbp);
  844                         goto end;
  845                 } else if (nstart <= cstart && nend >= cend) {
  846                         /*
  847                          *      +-------------+      +-------------+
  848                          *      | current bio |      | current bio |
  849                          *  +---+-------------+---+  +-------------+---+
  850                          *  |   |             |   |  |             |   |
  851                          *  |   |             |   |  |             |   |
  852                          *  |   +-------------+   |  +-------------+   |
  853                          *  |       new bio       |  |     new bio     |
  854                          *  +---------------------+  +-----------------+
  855                          *
  856                          *      +-------------+  +-------------+
  857                          *      | current bio |  | current bio |
  858                          *  +---+-------------+  +-------------+
  859                          *  |   |             |  |             |
  860                          *  |   |             |  |             |
  861                          *  |   +-------------+  +-------------+
  862                          *  |     new bio     |  |   new bio   |
  863                          *  +-----------------+  +-------------+
  864                          */
  865                         g_journal_stats_bytes_skipped += cbp->bio_length;
  866                         cbp->bio_offset = nstart;
  867                         cbp->bio_joffset = joffset;
  868                         cbp->bio_length = cend - nstart;
  869                         if (cbp->bio_data != NULL) {
  870                                 gj_free(cbp->bio_data, cend - cstart);
  871                                 cbp->bio_data = NULL;
  872                         }
  873                         if (data != NULL) {
  874                                 cbp->bio_data = gj_malloc(cbp->bio_length,
  875                                     flags);
  876                                 if (cbp->bio_data != NULL) {
  877                                         bcopy(data, cbp->bio_data,
  878                                             cbp->bio_length);
  879                                 }
  880                                 data += cend - nstart;
  881                         }
  882                         joffset += cend - nstart;
  883                         nstart = cend;
  884                         GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
  885                 } else if (nstart > cstart && nend >= cend) {
  886                         /*
  887                          *  +-----------------+  +-------------+
  888                          *  |   current bio   |  | current bio |
  889                          *  |   +-------------+  |   +---------+---+
  890                          *  |   |             |  |   |         |   |
  891                          *  |   |             |  |   |         |   |
  892                          *  +---+-------------+  +---+---------+   |
  893                          *      |   new bio   |      |   new bio   |
  894                          *      +-------------+      +-------------+
  895                          */
  896                         g_journal_stats_bytes_skipped += cend - nstart;
  897                         nbp = g_journal_new_bio(nstart, cend, joffset, data,
  898                             flags);
  899                         nbp->bio_next = cbp->bio_next;
  900                         cbp->bio_next = nbp;
  901                         cbp->bio_length = nstart - cstart;
  902                         if (cbp->bio_data != NULL) {
  903                                 cbp->bio_data = gj_realloc(cbp->bio_data,
  904                                     cbp->bio_length, cend - cstart);
  905                         }
  906                         if (data != NULL)
  907                                 data += cend - nstart;
  908                         joffset += cend - nstart;
  909                         nstart = cend;
  910                         n++;
  911                         GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
  912                 } else if (nstart > cstart && nend < cend) {
  913                         /*
  914                          *  +---------------------+
  915                          *  |     current bio     |
  916                          *  |   +-------------+   |
  917                          *  |   |             |   |
  918                          *  |   |             |   |
  919                          *  +---+-------------+---+
  920                          *      |   new bio   |
  921                          *      +-------------+
  922                          */
  923                         g_journal_stats_bytes_skipped += nend - nstart;
  924                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  925                             flags);
  926                         nbp->bio_next = cbp->bio_next;
  927                         cbp->bio_next = nbp;
  928                         if (cbp->bio_data == NULL)
  929                                 tmpdata = NULL;
  930                         else
  931                                 tmpdata = cbp->bio_data + nend - cstart;
  932                         nbp = g_journal_new_bio(nend, cend,
  933                             cbp->bio_joffset + nend - cstart, tmpdata, flags);
  934                         nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
  935                         ((struct bio *)cbp->bio_next)->bio_next = nbp;
  936                         cbp->bio_length = nstart - cstart;
  937                         if (cbp->bio_data != NULL) {
  938                                 cbp->bio_data = gj_realloc(cbp->bio_data,
  939                                     cbp->bio_length, cend - cstart);
  940                         }
  941                         n += 2;
  942                         GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
  943                         goto end;
  944                 } else if (nstart <= cstart && nend < cend) {
  945                         /*
  946                          *  +-----------------+      +-------------+
  947                          *  |   current bio   |      | current bio |
  948                          *  +-------------+   |  +---+---------+   |
  949                          *  |             |   |  |   |         |   |
  950                          *  |             |   |  |   |         |   |
  951                          *  +-------------+---+  |   +---------+---+
  952                          *  |   new bio   |      |   new bio   |
  953                          *  +-------------+      +-------------+
  954                          */
  955                         g_journal_stats_bytes_skipped += nend - nstart;
  956                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
  957                             flags);
  958                         if (pbp == NULL)
  959                                 *head = nbp;
  960                         else
  961                                 pbp->bio_next = nbp;
  962                         nbp->bio_next = cbp;
  963                         cbp->bio_offset = nend;
  964                         cbp->bio_length = cend - nend;
  965                         cbp->bio_joffset += nend - cstart;
  966                         tmpdata = cbp->bio_data;
  967                         if (tmpdata != NULL) {
  968                                 cbp->bio_data = gj_malloc(cbp->bio_length,
  969                                     flags);
  970                                 if (cbp->bio_data != NULL) {
  971                                         bcopy(tmpdata + nend - cstart,
  972                                             cbp->bio_data, cbp->bio_length);
  973                                 }
  974                                 gj_free(tmpdata, cend - cstart);
  975                         }
  976                         n++;
  977                         GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
  978                         goto end;
  979                 }
  980                 if (nstart == nend)
  981                         goto end;
  982                 pbp = cbp;
  983         }
  984         nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
  985         if (pbp == NULL)
  986                 *head = nbp;
  987         else
  988                 pbp->bio_next = nbp;
  989         nbp->bio_next = NULL;
  990         n++;
  991         GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
  992 end:
  993         if (g_journal_debug >= 3) {
  994                 GJQ_FOREACH(*head, cbp) {
  995                         GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
  996                             (intmax_t)cbp->bio_offset,
  997                             (intmax_t)cbp->bio_length,
  998                             (intmax_t)cbp->bio_joffset, cbp->bio_data);
  999                 }
 1000                 GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
 1001         }
 1002         return (n);
 1003 }
 1004 
 1005 /*
 1006  * The function combines neighbour bios trying to squeeze as much data as
 1007  * possible into one bio.
 1008  *
 1009  * The function returns the number of bios combined (negative value).
 1010  */
 1011 static int
 1012 g_journal_optimize(struct bio *head)
 1013 {
 1014         struct bio *cbp, *pbp;
 1015         int n;
 1016 
 1017         n = 0;
 1018         pbp = NULL;
 1019         GJQ_FOREACH(head, cbp) {
 1020                 /* Skip bios which has to be read first. */
 1021                 if (cbp->bio_data == NULL) {
 1022                         pbp = NULL;
 1023                         continue;
 1024                 }
 1025                 /* There is no previous bio yet. */
 1026                 if (pbp == NULL) {
 1027                         pbp = cbp;
 1028                         continue;
 1029                 }
 1030                 /* Is this a neighbour bio? */
 1031                 if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
 1032                         /* Be sure that bios queue is sorted. */
 1033                         KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
 1034                             ("poffset=%jd plength=%jd coffset=%jd",
 1035                             (intmax_t)pbp->bio_offset,
 1036                             (intmax_t)pbp->bio_length,
 1037                             (intmax_t)cbp->bio_offset));
 1038                         pbp = cbp;
 1039                         continue;
 1040                 }
 1041                 /* Be sure we don't end up with too big bio. */
 1042                 if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
 1043                         pbp = cbp;
 1044                         continue;
 1045                 }
 1046                 /* Ok, we can join bios. */
 1047                 GJ_LOGREQ(4, pbp, "Join: ");
 1048                 GJ_LOGREQ(4, cbp, "and: ");
 1049                 pbp->bio_data = gj_realloc(pbp->bio_data,
 1050                     pbp->bio_length + cbp->bio_length, pbp->bio_length);
 1051                 bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
 1052                     cbp->bio_length);
 1053                 gj_free(cbp->bio_data, cbp->bio_length);
 1054                 pbp->bio_length += cbp->bio_length;
 1055                 pbp->bio_next = cbp->bio_next;
 1056                 g_destroy_bio(cbp);
 1057                 cbp = pbp;
 1058                 g_journal_stats_combined_ios++;
 1059                 n--;
 1060                 GJ_LOGREQ(4, pbp, "Got: ");
 1061         }
 1062         return (n);
 1063 }
 1064 
 1065 /*
 1066  * TODO: Update comment.
 1067  * These are functions responsible for copying one portion of data from journal
 1068  * to the destination provider.
 1069  * The order goes like this:
 1070  * 1. Read the header, which contains information about data blocks
 1071  *    following it.
 1072  * 2. Read the data blocks from the journal.
 1073  * 3. Write the data blocks on the data provider.
 1074  *
 1075  * g_journal_copy_start()
 1076  * g_journal_copy_done() - got finished write request, logs potential errors.
 1077  */
 1078 
 1079 /*
 1080  * When there is no data in cache, this function is used to read it.
 1081  */
 1082 static void
 1083 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
 1084 {
 1085         struct bio *cbp;
 1086 
 1087         /*
 1088          * We were short in memory, so data was freed.
 1089          * In that case we need to read it back from journal.
 1090          */
 1091         cbp = g_alloc_bio();
 1092         cbp->bio_cflags = bp->bio_cflags;
 1093         cbp->bio_parent = bp;
 1094         cbp->bio_offset = bp->bio_joffset;
 1095         cbp->bio_length = bp->bio_length;
 1096         cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
 1097         cbp->bio_cmd = BIO_READ;
 1098         cbp->bio_done = g_journal_std_done;
 1099         GJ_LOGREQ(4, cbp, "READ FIRST");
 1100         g_io_request(cbp, sc->sc_jconsumer);
 1101         g_journal_cache_misses++;
 1102 }
 1103 
 1104 static void
 1105 g_journal_copy_send(struct g_journal_softc *sc)
 1106 {
 1107         struct bio *bioq, *bp, *lbp;
 1108 
 1109         bioq = lbp = NULL;
 1110         mtx_lock(&sc->sc_mtx);
 1111         for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
 1112                 bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
 1113                 if (bp == NULL)
 1114                         break;
 1115                 GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
 1116                 sc->sc_copy_in_progress++;
 1117                 GJQ_INSERT_AFTER(bioq, bp, lbp);
 1118                 lbp = bp;
 1119         }
 1120         mtx_unlock(&sc->sc_mtx);
 1121         if (g_journal_do_optimize)
 1122                 sc->sc_copy_in_progress += g_journal_optimize(bioq);
 1123         while ((bp = GJQ_FIRST(bioq)) != NULL) {
 1124                 GJQ_REMOVE(bioq, bp);
 1125                 GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
 1126                 bp->bio_cflags = GJ_BIO_COPY;
 1127                 if (bp->bio_data == NULL)
 1128                         g_journal_read_first(sc, bp);
 1129                 else {
 1130                         bp->bio_joffset = 0;
 1131                         GJ_LOGREQ(4, bp, "SEND");
 1132                         g_io_request(bp, sc->sc_dconsumer);
 1133                 }
 1134         }
 1135 }
 1136 
 1137 static void
 1138 g_journal_copy_start(struct g_journal_softc *sc)
 1139 {
 1140 
 1141         /*
 1142          * Remember in metadata that we're starting to copy journaled data
 1143          * to the data provider.
 1144          * In case of power failure, we will copy these data once again on boot.
 1145          */
 1146         if (!sc->sc_journal_copying) {
 1147                 sc->sc_journal_copying = 1;
 1148                 GJ_DEBUG(1, "Starting copy of journal.");
 1149                 g_journal_metadata_update(sc);
 1150         }
 1151         g_journal_copy_send(sc);
 1152 }
 1153 
 1154 /*
 1155  * Data block has been read from the journal provider.
 1156  */
 1157 static int
 1158 g_journal_copy_read_done(struct bio *bp)
 1159 {
 1160         struct g_journal_softc *sc;
 1161         struct g_consumer *cp;
 1162         struct bio *pbp;
 1163 
 1164         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 1165             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 1166 
 1167         sc = bp->bio_from->geom->softc;
 1168         pbp = bp->bio_parent;
 1169 
 1170         if (bp->bio_error != 0) {
 1171                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 1172                     bp->bio_to->name, bp->bio_error);
 1173                 /*
 1174                  * We will not be able to deliver WRITE request as well.
 1175                  */
 1176                 gj_free(bp->bio_data, bp->bio_length);
 1177                 g_destroy_bio(pbp);
 1178                 g_destroy_bio(bp);
 1179                 sc->sc_copy_in_progress--;
 1180                 return (1);
 1181         }
 1182         pbp->bio_data = bp->bio_data;
 1183         cp = sc->sc_dconsumer;
 1184         g_io_request(pbp, cp);
 1185         GJ_LOGREQ(4, bp, "READ DONE");
 1186         g_destroy_bio(bp);
 1187         return (0);
 1188 }
 1189 
 1190 /*
 1191  * Data block has been written to the data provider.
 1192  */
 1193 static void
 1194 g_journal_copy_write_done(struct bio *bp)
 1195 {
 1196         struct g_journal_softc *sc;
 1197 
 1198         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 1199             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 1200 
 1201         sc = bp->bio_from->geom->softc;
 1202         sc->sc_copy_in_progress--;
 1203 
 1204         if (bp->bio_error != 0) {
 1205                 GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
 1206                     bp->bio_error);
 1207         }
 1208         GJQ_REMOVE(sc->sc_copy_queue, bp);
 1209         gj_free(bp->bio_data, bp->bio_length);
 1210         GJ_LOGREQ(4, bp, "DONE");
 1211         g_destroy_bio(bp);
 1212 
 1213         if (sc->sc_copy_in_progress == 0) {
 1214                 /*
 1215                  * This was the last write request for this journal.
 1216                  */
 1217                 GJ_DEBUG(1, "Data has been copied.");
 1218                 sc->sc_journal_copying = 0;
 1219         }
 1220 }
 1221 
 1222 static void g_journal_flush_done(struct bio *bp);
 1223 
 1224 /*
 1225  * Flush one record onto active journal provider.
 1226  */
 1227 static void
 1228 g_journal_flush(struct g_journal_softc *sc)
 1229 {
 1230         struct g_journal_record_header hdr;
 1231         struct g_journal_entry *ent;
 1232         struct g_provider *pp;
 1233         struct bio **bioq;
 1234         struct bio *bp, *fbp, *pbp;
 1235         off_t joffset, size;
 1236         u_char *data, hash[16];
 1237         MD5_CTX ctx;
 1238         u_int i;
 1239 
 1240         if (sc->sc_current_count == 0)
 1241                 return;
 1242 
 1243         size = 0;
 1244         pp = sc->sc_jprovider;
 1245         GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 1246         joffset = sc->sc_journal_offset;
 1247 
 1248         GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
 1249             sc->sc_current_count, pp->name, (intmax_t)joffset);
 1250 
 1251         /*
 1252          * Store 'journal id', so we know to which journal this record belongs.
 1253          */
 1254         hdr.jrh_journal_id = sc->sc_journal_id;
 1255         /* Could be less than g_journal_record_entries if called due timeout. */
 1256         hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
 1257         strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
 1258 
 1259         bioq = &sc->sc_active.jj_queue;
 1260         pbp = sc->sc_flush_queue;
 1261 
 1262         fbp = g_alloc_bio();
 1263         fbp->bio_parent = NULL;
 1264         fbp->bio_cflags = GJ_BIO_JOURNAL;
 1265         fbp->bio_offset = -1;
 1266         fbp->bio_joffset = joffset;
 1267         fbp->bio_length = pp->sectorsize;
 1268         fbp->bio_cmd = BIO_WRITE;
 1269         fbp->bio_done = g_journal_std_done;
 1270         GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
 1271         pbp = fbp;
 1272         fbp->bio_to = pp;
 1273         GJ_LOGREQ(4, fbp, "FLUSH_OUT");
 1274         joffset += pp->sectorsize;
 1275         sc->sc_flush_count++;
 1276         if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 1277                 MD5Init(&ctx);
 1278 
 1279         for (i = 0; i < hdr.jrh_nentries; i++) {
 1280                 bp = sc->sc_current_queue;
 1281                 KASSERT(bp != NULL, ("NULL bp"));
 1282                 bp->bio_to = pp;
 1283                 GJ_LOGREQ(4, bp, "FLUSHED");
 1284                 sc->sc_current_queue = bp->bio_next;
 1285                 bp->bio_next = NULL;
 1286                 sc->sc_current_count--;
 1287 
 1288                 /* Add to the header. */
 1289                 ent = &hdr.jrh_entries[i];
 1290                 ent->je_offset = bp->bio_offset;
 1291                 ent->je_joffset = joffset;
 1292                 ent->je_length = bp->bio_length;
 1293                 size += ent->je_length;
 1294 
 1295                 data = bp->bio_data;
 1296                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 1297                         MD5Update(&ctx, data, ent->je_length);
 1298                 bzero(bp, sizeof(*bp));
 1299                 bp->bio_cflags = GJ_BIO_JOURNAL;
 1300                 bp->bio_offset = ent->je_offset;
 1301                 bp->bio_joffset = ent->je_joffset;
 1302                 bp->bio_length = ent->je_length;
 1303                 bp->bio_data = data;
 1304                 bp->bio_cmd = BIO_WRITE;
 1305                 bp->bio_done = g_journal_std_done;
 1306                 GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
 1307                 pbp = bp;
 1308                 bp->bio_to = pp;
 1309                 GJ_LOGREQ(4, bp, "FLUSH_OUT");
 1310                 joffset += bp->bio_length;
 1311                 sc->sc_flush_count++;
 1312 
 1313                 /*
 1314                  * Add request to the active sc_journal_queue queue.
 1315                  * This is our cache. After journal switch we don't have to
 1316                  * read the data from the inactive journal, because we keep
 1317                  * it in memory.
 1318                  */
 1319                 g_journal_insert(bioq, ent->je_offset,
 1320                     ent->je_offset + ent->je_length, ent->je_joffset, data,
 1321                     M_NOWAIT);
 1322         }
 1323 
 1324         /*
 1325          * After all requests, store valid header.
 1326          */
 1327         data = gj_malloc(pp->sectorsize, M_WAITOK);
 1328         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 1329                 MD5Final(hash, &ctx);
 1330                 bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
 1331         }
 1332         g_journal_record_header_encode(&hdr, data);
 1333         fbp->bio_data = data;
 1334 
 1335         sc->sc_journal_offset = joffset;
 1336 
 1337         g_journal_check_overflow(sc);
 1338 }
 1339 
 1340 /*
 1341  * Flush request finished.
 1342  */
 1343 static void
 1344 g_journal_flush_done(struct bio *bp)
 1345 {
 1346         struct g_journal_softc *sc;
 1347         struct g_consumer *cp;
 1348 
 1349         KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
 1350             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
 1351 
 1352         cp = bp->bio_from;
 1353         sc = cp->geom->softc;
 1354         sc->sc_flush_in_progress--;
 1355 
 1356         if (bp->bio_error != 0) {
 1357                 GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
 1358                     bp->bio_error);
 1359         }
 1360         gj_free(bp->bio_data, bp->bio_length);
 1361         GJ_LOGREQ(4, bp, "DONE");
 1362         g_destroy_bio(bp);
 1363 }
 1364 
 1365 static void g_journal_release_delayed(struct g_journal_softc *sc);
 1366 
 1367 static void
 1368 g_journal_flush_send(struct g_journal_softc *sc)
 1369 {
 1370         struct g_consumer *cp;
 1371         struct bio *bioq, *bp, *lbp;
 1372 
 1373         cp = sc->sc_jconsumer;
 1374         bioq = lbp = NULL;
 1375         while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
 1376                 /* Send one flush requests to the active journal. */
 1377                 bp = GJQ_FIRST(sc->sc_flush_queue);
 1378                 if (bp != NULL) {
 1379                         GJQ_REMOVE(sc->sc_flush_queue, bp);
 1380                         sc->sc_flush_count--;
 1381                         bp->bio_offset = bp->bio_joffset;
 1382                         bp->bio_joffset = 0;
 1383                         sc->sc_flush_in_progress++;
 1384                         GJQ_INSERT_AFTER(bioq, bp, lbp);
 1385                         lbp = bp;
 1386                 }
 1387                 /* Try to release delayed requests. */
 1388                 g_journal_release_delayed(sc);
 1389                 /* If there are no requests to flush, leave. */
 1390                 if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
 1391                         break;
 1392         }
 1393         if (g_journal_do_optimize)
 1394                 sc->sc_flush_in_progress += g_journal_optimize(bioq);
 1395         while ((bp = GJQ_FIRST(bioq)) != NULL) {
 1396                 GJQ_REMOVE(bioq, bp);
 1397                 GJ_LOGREQ(3, bp, "Flush request send");
 1398                 g_io_request(bp, cp);
 1399         }
 1400 }
 1401 
 1402 static void
 1403 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
 1404 {
 1405         int n;
 1406 
 1407         GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
 1408         n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
 1409         sc->sc_current_count += n;
 1410         n = g_journal_optimize(sc->sc_current_queue);
 1411         sc->sc_current_count += n;
 1412         /*
 1413          * For requests which are added to the current queue we deliver
 1414          * response immediately.
 1415          */
 1416         bp->bio_completed = bp->bio_length;
 1417         g_io_deliver(bp, 0);
 1418         if (sc->sc_current_count >= g_journal_record_entries) {
 1419                 /*
 1420                  * Let's flush one record onto active journal provider.
 1421                  */
 1422                 g_journal_flush(sc);
 1423         }
 1424 }
 1425 
 1426 static void
 1427 g_journal_release_delayed(struct g_journal_softc *sc)
 1428 {
 1429         struct bio *bp;
 1430 
 1431         for (;;) {
 1432                 /* The flush queue is full, exit. */
 1433                 if (sc->sc_flush_count >= g_journal_accept_immediately)
 1434                         return;
 1435                 bp = bioq_takefirst(&sc->sc_delayed_queue);
 1436                 if (bp == NULL)
 1437                         return;
 1438                 sc->sc_delayed_count--;
 1439                 g_journal_add_current(sc, bp);
 1440         }
 1441 }
 1442 
 1443 /*
 1444  * Add I/O request to the current queue. If we have enough requests for one
 1445  * journal record we flush them onto active journal provider.
 1446  */
 1447 static void
 1448 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
 1449 {
 1450 
 1451         /*
 1452          * The flush queue is full, we need to delay the request.
 1453          */
 1454         if (sc->sc_delayed_count > 0 ||
 1455             sc->sc_flush_count >= g_journal_accept_immediately) {
 1456                 GJ_LOGREQ(4, bp, "DELAYED");
 1457                 bioq_insert_tail(&sc->sc_delayed_queue, bp);
 1458                 sc->sc_delayed_count++;
 1459                 return;
 1460         }
 1461 
 1462         KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
 1463             ("DELAYED queue not empty."));
 1464         g_journal_add_current(sc, bp);
 1465 }
 1466 
 1467 static void g_journal_read_done(struct bio *bp);
 1468 
 1469 /* Scan list 'head' for the first entry overlapping [ostart, oend) of pbp.
 1470  * Returns pbp if the request was completed and delivered here, the
 1471  * overlapping entry if only part was found, or bp as left by the scan. */
 1472 static struct bio *
 1473 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
 1474     off_t oend)
 1475 {
 1476         off_t cstart, cend;
 1477         struct bio *bp;
 1478 
 1479         GJQ_FOREACH(head, bp) {
 1480                 if (bp->bio_offset == -1)       /* NOTE(review): -1 presumably marks an unused entry - confirm */
 1481                         continue;
 1482                 cstart = MAX(ostart, bp->bio_offset);   /* [cstart, cend) = overlap with this entry */
 1483                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1484                 if (cend <= ostart)     /* entry ends at or before the requested range */
 1485                         continue;
 1486                 else if (cstart >= oend) {      /* entry starts at or after the range end */
 1487                         if (!sorted)
 1488                                 continue;
 1489                         else {
 1490                                 bp = NULL;      /* sorted list: stop here and report no match */
 1491                                 break;
 1492                         }
 1493                 }
 1494                 if (bp->bio_data == NULL)       /* no data in memory: return the entry uncopied */
 1495                         break;
 1496                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 1497                     bp);
 1498                 bcopy(bp->bio_data + cstart - bp->bio_offset,   /* copy only the overlapping bytes */
 1499                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 1500                 pbp->bio_completed += cend - cstart;
 1501                 if (pbp->bio_completed == pbp->bio_length) {
 1502                         /*
 1503                          * Cool, the whole request was in cache, deliver happy
 1504                          * message.
 1505                          */
 1506                         g_io_deliver(pbp, 0);
 1507                         return (pbp);
 1508                 }
 1509                 break;  /* partial hit: return the entry that supplied the data */
 1510         }
 1511         return (bp);    /* NOTE(review): assumes GJQ_FOREACH leaves bp NULL when the list is exhausted - confirm */
 1512 }
 1513 
 1514 /* Same search as g_journal_read_find(), but over a TAILQ-based bio queue
 1515  * (used for the delayed queue); every entry must carry data in memory.
 1516  * Returns pbp when the request was completed here, else the entry or NULL. */
 1517 static struct bio *
 1518 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
 1519     off_t oend)
 1520 {
 1521         off_t cstart, cend;
 1522         struct bio *bp;
 1523 
 1524         TAILQ_FOREACH(bp, head, bio_queue) {
 1525                 cstart = MAX(ostart, bp->bio_offset);   /* [cstart, cend) = overlap with this entry */
 1526                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1527                 if (cend <= ostart)     /* entry ends at or before the requested range */
 1528                         continue;
 1529                 else if (cstart >= oend)        /* entry starts at or after the range end */
 1530                         continue;
 1531                 KASSERT(bp->bio_data != NULL,
 1532                     ("%s: bio_data == NULL", __func__));
 1533                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 1534                     bp);
 1535                 bcopy(bp->bio_data + cstart - bp->bio_offset,   /* copy only the overlapping bytes */
 1536                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 1537                 pbp->bio_completed += cend - cstart;
 1538                 if (pbp->bio_completed == pbp->bio_length) {
 1539                         /*
 1540                          * Cool, the whole request was in cache, deliver happy
 1541                          * message.
 1542                          */
 1543                         g_io_deliver(pbp, 0);
 1544                         return (pbp);
 1545                 }
 1546                 break;  /* partial hit: return the entry that supplied the data */
 1547         }
 1548         return (bp);    /* NULL here when no entry overlapped (TAILQ_FOREACH ran to the end) */
 1549 }
 1550 
 1551 /*
 1552  * This function is used for collecting data on read.
 1553  * The complexity is because parts of the data can be stored in six different
 1554  * places:
 1555  * - in delayed requests
 1556  * - in memory - the data not yet sent to the active journal provider
 1557  * - in requests which are going to be sent to the active journal
 1558  * - in the active journal
 1559  * - in the inactive journal
 1560  * - in the data provider
 1561  */
 1562 static void
 1563 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
 1564     off_t oend)
 1565 {
 1566         struct bio *bp, *nbp, *head;
 1567         off_t cstart, cend;
 1568         u_int i, sorted = 0;
 1569 
 1570         GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
 1571 
 1572         cstart = cend = -1;
 1573         bp = NULL;
 1574         head = NULL;
 1575         for (i = 0; i <= 5; i++) {      /* try each source in turn; stop at the first hit */
 1576                 switch (i) {
 1577                 case 0: /* Delayed requests. */
 1578                         head = NULL;    /* unused: the delayed queue is searched via g_journal_read_queue_find() */
 1579                         sorted = 0;
 1580                         break;
 1581                 case 1: /* Not-yet-sent data. */
 1582                         head = sc->sc_current_queue;
 1583                         sorted = 1;
 1584                         break;
 1585                 case 2: /* In-flight to the active journal. */
 1586                         head = sc->sc_flush_queue;
 1587                         sorted = 0;
 1588                         break;
 1589                 case 3: /* Active journal. */
 1590                         head = sc->sc_active.jj_queue;
 1591                         sorted = 1;
 1592                         break;
 1593                 case 4: /* Inactive journal. */
 1594                         /*
 1595                          * XXX: Here could be a race with g_journal_lowmem().
 1596                          */
 1597                         head = sc->sc_inactive.jj_queue;
 1598                         sorted = 1;
 1599                         break;
 1600                 case 5: /* In-flight to the data provider. */
 1601                         head = sc->sc_copy_queue;
 1602                         sorted = 0;
 1603                         break;
 1604                 default:
 1605                         panic("gjournal %s: i=%d", __func__, i);
 1606                 }
 1607                 if (i == 0)
 1608                         bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
 1609                 else
 1610                         bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
 1611                 if (bp == pbp) { /* Got the whole request. */
 1612                         GJ_DEBUG(2, "Got the whole request from %u.", i);
 1613                         return;
 1614                 } else if (bp != NULL) {
 1615                         cstart = MAX(ostart, bp->bio_offset);   /* recompute the part covered by this entry */
 1616                         cend = MIN(oend, bp->bio_offset + bp->bio_length);
 1617                         GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
 1618                             i, (intmax_t)cstart, (intmax_t)cend);
 1619                         break;
 1620                 }
 1621         }
 1622         if (bp != NULL) {
 1623                 if (bp->bio_data == NULL) {     /* entry has no data in memory: read that part from the journal */
 1624                         nbp = g_duplicate_bio(pbp);
 1625                         nbp->bio_cflags = GJ_BIO_READ;
 1626                         nbp->bio_data =
 1627                             pbp->bio_data + cstart - pbp->bio_offset;
 1628                         nbp->bio_offset =
 1629                             bp->bio_joffset + cstart - bp->bio_offset;  /* position inside the journal */
 1630                         nbp->bio_length = cend - cstart;
 1631                         nbp->bio_done = g_journal_read_done;
 1632                         g_io_request(nbp, sc->sc_jconsumer);
 1633                 }
 1634                 /*
 1635                  * If we don't have the whole request yet, call g_journal_read()
 1636                  * recursively.
 1637                  */
 1638                 if (ostart < cstart)    /* part before the hit is still missing */
 1639                         g_journal_read(sc, pbp, ostart, cstart);
 1640                 if (oend > cend)        /* part after the hit is still missing */
 1641                         g_journal_read(sc, pbp, cend, oend);
 1642         } else {
 1643                 /*
 1644                  * No data in memory, no data in journal.
 1645                  * It's time to ask the data provider.
 1646                  */
 1647                 GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
 1648                 nbp = g_duplicate_bio(pbp);
 1649                 nbp->bio_cflags = GJ_BIO_READ;
 1650                 nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 1651                 nbp->bio_offset = ostart;
 1652                 nbp->bio_length = oend - ostart;
 1653                 nbp->bio_done = g_journal_read_done;
 1654                 g_io_request(nbp, sc->sc_dconsumer);
 1655                 /* We have the whole request, return here. */
 1656                 return;
 1657         }
 1658 }
 1659 
 1660 /*
 1661  * Function responsible for handling finished READ requests.
 1662  * Modeled on g_std_done(); in addition it logs the error. The first child
 1663  * error is kept in the parent and reported when the parent is delivered.
 1664  */
 1665 static void
 1666 g_journal_read_done(struct bio *bp)
 1667 {
 1668         struct bio *pbp;
 1669 
 1670         KASSERT(bp->bio_cflags == GJ_BIO_READ,
 1671             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
 1672 
 1673         pbp = bp->bio_parent;
 1674         pbp->bio_inbed++;
 1675         pbp->bio_completed += bp->bio_length;   /* counted even on error so the parent can still finish */
 1676 
 1677         if (bp->bio_error != 0) {
 1678                 if (pbp->bio_error == 0)
 1679                         pbp->bio_error = bp->bio_error;
 1680                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 1681                     bp->bio_to->name, bp->bio_error);
 1682         }
 1683         g_destroy_bio(bp);
 1684         if (pbp->bio_children == pbp->bio_inbed &&
 1685             pbp->bio_completed == pbp->bio_length) {
 1686                 /* We're done; pass on the recorded error instead of 0. */
 1687                 g_io_deliver(pbp, pbp->bio_error);
 1688         }
 1689 }
 1690 
 1691 /*
 1692  * Deactivate the current journal and activate the next one.
 1693  */
 1694 static void
 1695 g_journal_switch(struct g_journal_softc *sc)
 1696 {
 1697         struct g_provider *pp;
 1698 
 1699         if (JEMPTY(sc)) {
 1700                 GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 1701                 pp = LIST_FIRST(&sc->sc_geom->provider);
 1702                 if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {       /* not yet marked clean and write access count is zero */
 1703                         sc->sc_flags |= GJF_DEVICE_CLEAN;
 1704                         GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 1705                         g_journal_metadata_update(sc);
 1706                 }
 1707         } else {
 1708                 GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
 1709 
 1710                 pp = sc->sc_jprovider;
 1711 
 1712                 sc->sc_journal_previous_id = sc->sc_journal_id;
 1713 
 1714                 sc->sc_journal_id = sc->sc_journal_next_id;     /* rotate IDs: next becomes current */
 1715                 sc->sc_journal_next_id = arc4random();
 1716 
 1717                 GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 1718 
 1719                 g_journal_write_header(sc);
 1720 
 1721                 sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;    /* the active journal becomes the inactive one */
 1722                 sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
 1723 
 1724                 sc->sc_active.jj_offset =
 1725                     sc->sc_journal_offset - pp->sectorsize;     /* NOTE(review): presumably the header just written - confirm in g_journal_write_header() */
 1726                 sc->sc_active.jj_queue = NULL;
 1727 
 1728                 /*
 1729                  * Switch is done, start copying data from the (now) inactive
 1730                  * journal to the data provider.
 1731                  */
 1732                 g_journal_copy_start(sc);
 1733         }
 1734         mtx_lock(&sc->sc_mtx);
 1735         sc->sc_flags &= ~GJF_DEVICE_SWITCH;     /* the switch request has been handled */
 1736         mtx_unlock(&sc->sc_mtx);
 1737 }
 1738 
 1739 static void
 1740 g_journal_initialize(struct g_journal_softc *sc)
 1741 {
 1742         /* Start a fresh journal at sc_jstart with new random IDs. */
 1743         sc->sc_journal_id = arc4random();
 1744         sc->sc_journal_next_id = arc4random();
 1745         sc->sc_journal_previous_id = sc->sc_journal_id;  /* no real predecessor: reuse the current ID */
 1746         sc->sc_journal_offset = sc->sc_jstart;
 1747         sc->sc_inactive.jj_offset = sc->sc_jstart;
 1748         g_journal_write_header(sc);
 1749         sc->sc_active.jj_offset = sc->sc_jstart;        /* NOTE(review): set after the header write; ordering presumably matters - confirm */
 1750 }
 1751 
 1752 static void
 1753 g_journal_mark_as_dirty(struct g_journal_softc *sc)
 1754 {
 1755         const struct g_journal_desc *fs;
 1756         int idx = 0;    /* index into the NULL-terminated g_journal_filesystems table */
 1757 
 1758         GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
 1759         while ((fs = g_journal_filesystems[idx++]) != NULL)
 1760                 fs->jd_dirty(sc->sc_dconsumer); /* each entry's jd_dirty hook gets the data consumer */
 1761 }
 1762 
 1763 /*
 1764  * Synchronously read one sector (the provider's sectorsize) at 'offset' into
 1765  * 'data'. Similar to g_read_data(9), but the caller supplies the bio and the
 1766  * buffer, so nothing is allocated per call. Returns the biowait() result.
 1767  */
 1768 static int
 1769 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
 1770     void *data)
 1771 {
 1772 
 1773         /* Start from a clean bio; no completion callback is installed. */
 1774         bzero(bp, sizeof(*bp));
 1775         bp->bio_data = data;
 1776         bp->bio_length = cp->provider->sectorsize;
 1777         bp->bio_offset = offset;
 1778         bp->bio_done = NULL;
 1779         bp->bio_cmd = BIO_READ;
 1780 
 1781         g_io_request(bp, cp);
 1782         return (biowait(bp, "gjs_read"));
 1783 }
 1784 
 1785 #if 0
 1786 /*
 1787  * Function is called when we start the journal device and we detect that
 1788  * one of the journals was not fully copied.
 1789  * The purpose of this function is to read all record headers from the journal
 1790  * and place them in the inactive queue, so we can start the journal
 1791  * synchronization process and the journal provider itself.
 1792  * Design decision was taken to not synchronize the whole journal here as it
 1793  * can take too much time. Reading headers only and delaying synchronization
 1794  * process until after journal provider is started should be the best choice.
 1795  */
 1796 #endif
 1797 
 1798 static void
 1799 g_journal_sync(struct g_journal_softc *sc)
 1800 {
 1801         struct g_journal_record_header rhdr;
 1802         struct g_journal_entry *ent;
 1803         struct g_journal_header jhdr;
 1804         struct g_consumer *cp;
 1805         struct bio *bp, *fbp, *tbp;
 1806         off_t joffset, offset;
 1807         u_char *buf, sum[16];
 1808         uint64_t id;
 1809         MD5_CTX ctx;
 1810         int error, found, i;
 1811 
 1812         found = 0;      /* number of journal terminations (next-journal headers) found */
 1813         fbp = NULL;     /* entries collected since the last termination */
 1814         cp = sc->sc_jconsumer;
 1815         bp = g_alloc_bio();
 1816         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 1817         offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
 1818 
 1819         GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
 1820 
 1821         /*
 1822          * Read and decode first journal header.
 1823          */
 1824         error = g_journal_sync_read(cp, bp, offset, buf);
 1825         if (error != 0) {
 1826                 GJ_DEBUG(0, "Error while reading journal header from %s.",
 1827                     cp->provider->name);
 1828                 goto end;
 1829         }
 1830         error = g_journal_header_decode(buf, &jhdr);
 1831         if (error != 0) {
 1832                 GJ_DEBUG(0, "Cannot decode journal header from %s.",
 1833                     cp->provider->name);
 1834                 goto end;
 1835         }
 1836         id = sc->sc_journal_id;
 1837         if (jhdr.jh_journal_id != sc->sc_journal_id) {
 1838                 GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
 1839                     (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
 1840                 goto end;
 1841         }
 1842         offset += cp->provider->sectorsize;     /* skip the journal header */
 1843         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; /* ID the terminating header must carry */
 1844 
 1845         for (;;) {
 1846                 /*
 1847                  * If the biggest record won't fit, look for a record header or
 1848                  * journal header from the beginning.
 1849                  */
 1850                 GJ_VALIDATE_OFFSET(offset, sc);
 1851                 error = g_journal_sync_read(cp, bp, offset, buf);
 1852                 if (error != 0) {
 1853                         /*
 1854                          * Not good. Having an error while reading header
 1855                          * means, that we cannot read next headers and in
 1856                          * consequence we cannot find termination.
 1857                          */
 1858                         GJ_DEBUG(0,
 1859                             "Error while reading record header from %s.",
 1860                             cp->provider->name);
 1861                         break;
 1862                 }
 1863 
 1864                 error = g_journal_record_header_decode(buf, &rhdr);
 1865                 if (error != 0) {
 1866                         GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
 1867                             (intmax_t)offset, error);
 1868                         /*
 1869                          * This is not a record header.
 1870                          * If we are lucky, this is next journal header.
 1871                          */
 1872                         error = g_journal_header_decode(buf, &jhdr);
 1873                         if (error != 0) {
 1874                                 GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
 1875                                     (intmax_t)offset, error);
 1876                                 /*
 1877                                  * Nope, this is not journal header, which
 1878                                  * basically means that journal is not
 1879                                  * terminated properly.
 1880                                  */
 1881                                 error = ENOENT;
 1882                                 break;
 1883                         }
 1884                         /*
 1885                          * Ok. This is header of _some_ journal. Now we need to
 1886                          * verify if this is header of the _next_ journal.
 1887                          */
 1888                         if (jhdr.jh_journal_id != id) {
 1889                                 GJ_DEBUG(1, "Journal ID mismatch at %jd "
 1890                                     "(0x%08x != 0x%08x).", (intmax_t)offset,
 1891                                     (u_int)jhdr.jh_journal_id, (u_int)id);
 1892                                 error = ENOENT;
 1893                                 break;
 1894                         }
 1895 
 1896                         /* Found termination. */
 1897                         found++;
 1898                         GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
 1899                             (intmax_t)offset, (u_int)id);
 1900                         sc->sc_active.jj_offset = offset;
 1901                         sc->sc_journal_offset =
 1902                             offset + cp->provider->sectorsize;
 1903                         sc->sc_journal_id = id;
 1904                         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 1905 
 1906                         while ((tbp = fbp) != NULL) {   /* the journal is terminated: move its entries to the inactive queue */
 1907                                 fbp = tbp->bio_next;
 1908                                 GJ_LOGREQ(3, tbp, "Adding request.");
 1909                                 g_journal_insert_bio(&sc->sc_inactive.jj_queue,
 1910                                     tbp, M_WAITOK);
 1911                         }
 1912 
 1913                         /* Skip journal's header. */
 1914                         offset += cp->provider->sectorsize;
 1915                         continue;
 1916                 }
 1917 
 1918                 /* Skip record's header. */
 1919                 offset += cp->provider->sectorsize;
 1920 
 1921                 /*
 1922                  * Collect information about every record entry on the pending
 1923                  * list (fbp); it is moved to the inactive queue on termination.
 1924                  */
 1925                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 1926                         MD5Init(&ctx);
 1927                 for (i = 0; i < rhdr.jrh_nentries; i++) {
 1928                         ent = &rhdr.jrh_entries[i];
 1929                         GJ_DEBUG(3, "Insert entry: %jd %jd.",
 1930                             (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
 1931                         g_journal_insert(&fbp, ent->je_offset,
 1932                             ent->je_offset + ent->je_length, ent->je_joffset,
 1933                             NULL, M_WAITOK);    /* NULL data: only the location is recorded */
 1934                         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 1935                                 u_char *buf2;
 1936 
 1937                                 /*
 1938                                  * TODO: Should use faster function (like
 1939                                  *       g_journal_sync_read()).
 1940                                  */
 1941                                 buf2 = g_read_data(cp, offset, ent->je_length,
 1942                                     NULL);
 1943                                 if (buf2 == NULL)
 1944                                         GJ_DEBUG(0, "Cannot read data at %jd.",
 1945                                             (intmax_t)offset);
 1946                                 else {
 1947                                         MD5Update(&ctx, buf2, ent->je_length);
 1948                                         g_free(buf2);
 1949                                 }
 1950                         }
 1951                         /* Skip entry's data. */
 1952                         offset += ent->je_length;
 1953                 }
 1954                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 1955                         MD5Final(sum, &ctx);
 1956                         if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {       /* a mismatch is only logged; scanning continues */
 1957                                 GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
 1958                                     (intmax_t)offset);
 1959                         }
 1960                 }
 1961         }
 1962 end:
 1963         gj_free(bp->bio_data, cp->provider->sectorsize);        /* bio_data is 'buf', set by g_journal_sync_read() */
 1964         g_destroy_bio(bp);
 1965 
 1966         /* Remove bios from unterminated journal. */
 1967         while ((tbp = fbp) != NULL) {
 1968                 fbp = tbp->bio_next;
 1969                 g_destroy_bio(tbp);
 1970         }
 1971 
 1972         if (found < 1 && joffset > 0) { /* no termination found at all */
 1973                 GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
 1974                     sc->sc_name);
 1975                 while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
 1976                         sc->sc_inactive.jj_queue = tbp->bio_next;
 1977                         g_destroy_bio(tbp);
 1978                 }
 1979                 g_journal_initialize(sc);
 1980                 g_journal_mark_as_dirty(sc);
 1981         } else {
 1982                 GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
 1983                 g_journal_copy_start(sc);
 1984         }
 1985 }
 1986 
 1987 /*
 1988  * Wait for requests. Entered with sc_mtx held; every path releases it.
 1989  * If we have requests in the current queue, flush them after 3 seconds from the
 1990  * last write. In this way we don't wait forever (or for journal switch) with
 1991  * storing not full records on journal.
 1992  */
 1993 static void
 1994 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
 1995 {
 1996         int error, timeout;
 1997 
 1998         GJ_DEBUG(3, "%s: enter", __func__);
 1999         if (sc->sc_current_count == 0) {        /* nothing pending: sleep until woken up */
 2000                 if (g_journal_debug < 2)
 2001                         msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
 2002                 else {
 2003                         /*
 2004                          * If we have debug turned on, show number of elements
 2005                          * in various queues.
 2006                          */
 2007                         for (;;) {
 2008                                 error = msleep(sc, &sc->sc_mtx, PRIBIO,
 2009                                     "gj:work", hz * 3);
 2010                                 if (error == 0) {       /* woken up (no timeout): unlock by hand, PDROP was not used */
 2011                                         mtx_unlock(&sc->sc_mtx);
 2012                                         break;
 2013                                 }
 2014                                 GJ_DEBUG(3, "Report: current count=%d",
 2015                                     sc->sc_current_count);
 2016                                 GJ_DEBUG(3, "Report: flush count=%d",
 2017                                     sc->sc_flush_count);
 2018                                 GJ_DEBUG(3, "Report: flush in progress=%d",
 2019                                     sc->sc_flush_in_progress);
 2020                                 GJ_DEBUG(3, "Report: copy in progress=%d",
 2021                                     sc->sc_copy_in_progress);
 2022                                 GJ_DEBUG(3, "Report: delayed=%d",
 2023                                     sc->sc_delayed_count);
 2024                         }
 2025                 }
 2026                 GJ_DEBUG(3, "%s: exit 1", __func__);
 2027                 return;
 2028         }
 2029 
 2030         /*
 2031          * Flush even not full records every 3 seconds.
 2032          */
 2033         timeout = (last_write + 3 - time_second) * hz; /* ticks left until 3 seconds after last_write */
 2034         if (timeout <= 0) {     /* deadline already passed: flush now */
 2035                 mtx_unlock(&sc->sc_mtx);
 2036                 g_journal_flush(sc);
 2037                 g_journal_flush_send(sc);
 2038                 GJ_DEBUG(3, "%s: exit 2", __func__);
 2039                 return;
 2040         }
 2041         error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
 2042         if (error == EWOULDBLOCK)       /* the sleep timed out without a wakeup */
 2043                 g_journal_flush_send(sc);
 2044         GJ_DEBUG(3, "%s: exit 3", __func__);
 2045 }
 2046 
 2047 /* Worker thread: brings the journal up (initialize or sync), probes BIO_FLUSH
 2048  * support, creates the "<name>.journal" provider and then serves queued bios
 2049  * until GJF_DEVICE_DESTROY makes it exit. */
 2050 static void
 2051 g_journal_worker(void *arg)
 2052 {
 2053         struct g_journal_softc *sc;
 2054         struct g_geom *gp;
 2055         struct g_provider *pp;
 2056         struct bio *bp;
 2057         time_t last_write;
 2058         int type;
 2059 
 2060         thread_lock(curthread);
 2061         sched_prio(curthread, PRIBIO);
 2062         thread_unlock(curthread);
 2063 
 2064         sc = arg;
 2065         type = 0;       /* gcc */
 2066 
 2067         if (sc->sc_flags & GJF_DEVICE_CLEAN) {
 2068                 GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
 2069                 g_journal_initialize(sc);
 2070         } else {
 2071                 g_journal_sync(sc);     /* not marked clean: scan the journal for a termination first */
 2072         }
 2073         /*
 2074          * Check if we can use BIO_FLUSH.
 2075          */
 2076         sc->sc_bio_flush = 0;
 2077         if (g_io_flush(sc->sc_jconsumer) == 0) {
 2078                 sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
 2079                 GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 2080                     sc->sc_jconsumer->provider->name);
 2081         } else {
 2082                 GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 2083                     sc->sc_jconsumer->provider->name);
 2084         }
 2085         if (sc->sc_jconsumer != sc->sc_dconsumer) {     /* separate data consumer: probe it as well */
 2086                 if (g_io_flush(sc->sc_dconsumer) == 0) {
 2087                         sc->sc_bio_flush |= GJ_FLUSH_DATA;
 2088                         GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 2089                             sc->sc_dconsumer->provider->name);
 2090                 } else {
 2091                         GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 2092                             sc->sc_dconsumer->provider->name);
 2093                 }
 2094         }
 2095 
 2096         gp = sc->sc_geom;
 2097         g_topology_lock();
 2098         pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
 2099         KASSERT(pp != NULL, ("Cannot create %s.journal.", sc->sc_name));
 2100         pp->mediasize = sc->sc_mediasize;
 2101         /*
 2102          * There could be a problem when data provider and journal providers
 2103          * have different sectorsize, but such scenario is prevented on journal
 2104          * creation.
 2105          */
 2106         pp->sectorsize = sc->sc_sectorsize;
 2107         g_error_provider(pp, 0);
 2108         g_topology_unlock();
 2109         last_write = time_second;
 2110 
 2111         for (;;) {
 2112                 /* Get first request from the queue. */
 2113                 mtx_lock(&sc->sc_mtx);
 2114                 bp = bioq_first(&sc->sc_back_queue);    /* the back queue is checked before the regular queue */
 2115                 if (bp != NULL)
 2116                         type = (bp->bio_cflags & GJ_BIO_MASK);
 2117                 if (bp == NULL) {
 2118                         bp = bioq_first(&sc->sc_regular_queue);
 2119                         if (bp != NULL)
 2120                                 type = GJ_BIO_REGULAR;
 2121                 }
 2122                 if (bp == NULL) {
 2123 try_switch:
 2124                         if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
 2125                             (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 2126                                 if (sc->sc_current_count > 0) { /* flush what is still in the current queue first */
 2127                                         mtx_unlock(&sc->sc_mtx);
 2128                                         g_journal_flush(sc);
 2129                                         g_journal_flush_send(sc);
 2130                                         continue;
 2131                                 }
 2132                                 if (sc->sc_flush_in_progress > 0)       /* wait for outstanding flushes */
 2133                                         goto sleep;
 2134                                 if (sc->sc_copy_in_progress > 0)        /* wait for outstanding copies */
 2135                                         goto sleep;
 2136                         }
 2137                         if (sc->sc_flags & GJF_DEVICE_SWITCH) {
 2138                                 mtx_unlock(&sc->sc_mtx);
 2139                                 g_journal_switch(sc);
 2140                                 wakeup(&sc->sc_journal_copying);
 2141                                 continue;
 2142                         }
 2143                         if (sc->sc_flags & GJF_DEVICE_DESTROY) {
 2144                                 GJ_DEBUG(1, "Shutting down worker "
 2145                                     "thread for %s.", gp->name);
 2146                                 sc->sc_worker = NULL;
 2147                                 wakeup(&sc->sc_worker);
 2148                                 mtx_unlock(&sc->sc_mtx);
 2149                                 kthread_exit(0);
 2150                         }
 2151 sleep:
 2152                         g_journal_wait(sc, last_write); /* called with sc_mtx held; it is released inside */
 2153                         continue;
 2154                 }
 2155                 /*
 2156                  * If we're in switch process, we need to delay all new
 2157                  * write requests until it's done.
 2158                  */
 2159                 if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
 2160                     type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
 2161                         GJ_LOGREQ(2, bp, "WRITE on SWITCH");
 2162                         goto try_switch;        /* bp stays on its queue; sc_mtx is still held */
 2163                 }
 2164                 if (type == GJ_BIO_REGULAR)
 2165                         bioq_remove(&sc->sc_regular_queue, bp);
 2166                 else
 2167                         bioq_remove(&sc->sc_back_queue, bp);
 2168                 mtx_unlock(&sc->sc_mtx);
 2169                 switch (type) {
 2170                 case GJ_BIO_REGULAR:
 2171                         /* Regular request. */
 2172                         switch (bp->bio_cmd) {
 2173                         case BIO_READ:
 2174                                 g_journal_read(sc, bp, bp->bio_offset,
 2175                                     bp->bio_offset + bp->bio_length);
 2176                                 break;
 2177                         case BIO_WRITE:
 2178                                 last_write = time_second;       /* restarts the 3-second flush timer used by g_journal_wait() */
 2179                                 g_journal_add_request(sc, bp);
 2180                                 g_journal_flush_send(sc);
 2181                                 break;
 2182                         default:
 2183                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 2184                         }
 2185                         break;
 2186                 case GJ_BIO_COPY:       /* completed copy I/O */
 2187                         switch (bp->bio_cmd) {
 2188                         case BIO_READ:
 2189                                 if (g_journal_copy_read_done(bp))
 2190                                         g_journal_copy_send(sc);
 2191                                 break;
 2192                         case BIO_WRITE:
 2193                                 g_journal_copy_write_done(bp);
 2194                                 g_journal_copy_send(sc);
 2195                                 break;
 2196                         default:
 2197                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 2198                         }
 2199                         break;
 2200                 case GJ_BIO_JOURNAL:    /* completed journal flush */
 2201                         g_journal_flush_done(bp);
 2202                         g_journal_flush_send(sc);
 2203                         break;
 2204                 case GJ_BIO_READ:       /* not expected on these queues: falls into the panic below */
 2205                 default:
 2206                         panic("Invalid bio (%d).", type);
 2207                 }
 2208         }
 2209 }
 2210 
 2211 static void
 2212 g_journal_destroy_event(void *arg, int flags __unused)
 2213 {
 2214         struct g_journal_softc *sc;
 2215 
 2216         g_topology_assert();
 2217         sc = arg;
 2218         g_journal_destroy(sc);
 2219 }
 2220 
 2221 static void
 2222 g_journal_timeout(void *arg)
 2223 {
 2224         struct g_journal_softc *sc;
 2225 
 2226         sc = arg;
 2227         GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
 2228             sc->sc_geom->name);
 2229         g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
 2230 }
 2231 
 2232 static struct g_geom *
 2233 g_journal_create(struct g_class *mp, struct g_provider *pp,
 2234     const struct g_journal_metadata *md)
 2235 {
 2236         struct g_journal_softc *sc;
 2237         struct g_geom *gp;
 2238         struct g_consumer *cp;
 2239         int error;
 2240 
 2241         sc = NULL;      /* gcc */
 2242 
 2243         g_topology_assert();
 2244         /*
 2245          * There are two possibilities:
 2246          * 1. Data and both journals are on the same provider.
 2247          * 2. Data and journals are all on separated providers.
 2248          */
 2249         /* Look for journal device with the same ID. */
 2250         LIST_FOREACH(gp, &mp->geom, geom) {
 2251                 sc = gp->softc;
 2252                 if (sc == NULL)
 2253                         continue;
 2254                 if (sc->sc_id == md->md_id)
 2255                         break;
 2256         }
 2257         if (gp == NULL)
 2258                 sc = NULL;
 2259         else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
 2260                 GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
 2261                 return (NULL);
 2262         }
 2263         if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
 2264                 GJ_DEBUG(0, "Invalid type on %s.", pp->name);
 2265                 return (NULL);
 2266         }
 2267         if (md->md_type & GJ_TYPE_DATA) {
 2268                 GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
 2269                     pp->name);
 2270         }
 2271         if (md->md_type & GJ_TYPE_JOURNAL) {
 2272                 GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
 2273                     pp->name);
 2274         }
 2275 
 2276         if (sc == NULL) {
 2277                 /* Action geom. */
 2278                 sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
 2279                 sc->sc_id = md->md_id;
 2280                 sc->sc_type = 0;
 2281                 sc->sc_flags = 0;
 2282                 sc->sc_worker = NULL;
 2283 
 2284                 gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
 2285                 gp->start = g_journal_start;
 2286                 gp->orphan = g_journal_orphan;
 2287                 gp->access = g_journal_access;
 2288                 gp->softc = sc;
 2289                 sc->sc_geom = gp;
 2290 
 2291                 mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
 2292 
 2293                 bioq_init(&sc->sc_back_queue);
 2294                 bioq_init(&sc->sc_regular_queue);
 2295                 bioq_init(&sc->sc_delayed_queue);
 2296                 sc->sc_delayed_count = 0;
 2297                 sc->sc_current_queue = NULL;
 2298                 sc->sc_current_count = 0;
 2299                 sc->sc_flush_queue = NULL;
 2300                 sc->sc_flush_count = 0;
 2301                 sc->sc_flush_in_progress = 0;
 2302                 sc->sc_copy_queue = NULL;
 2303                 sc->sc_copy_in_progress = 0;
 2304                 sc->sc_inactive.jj_queue = NULL;
 2305                 sc->sc_active.jj_queue = NULL;
 2306 
 2307                 callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
 2308                 if (md->md_type != GJ_TYPE_COMPLETE) {
 2309                         /*
 2310                          * Journal and data are on separate providers.
 2311                          * At this point we have only one of them.
 2312                          * We setup a timeout in case the other part will not
 2313                          * appear, so we won't wait forever.
 2314                          */
 2315                         callout_reset(&sc->sc_callout, 5 * hz,
 2316                             g_journal_timeout, sc);
 2317                 }
 2318         }
 2319 
 2320         /* Remember type of the data provider. */
 2321         if (md->md_type & GJ_TYPE_DATA)
 2322                 sc->sc_orig_type = md->md_type;
 2323         sc->sc_type |= md->md_type;
 2324         cp = NULL;
 2325 
 2326         if (md->md_type & GJ_TYPE_DATA) {
 2327                 if (md->md_flags & GJ_FLAG_CLEAN)
 2328                         sc->sc_flags |= GJF_DEVICE_CLEAN;
 2329                 if (md->md_flags & GJ_FLAG_CHECKSUM)
 2330                         sc->sc_flags |= GJF_DEVICE_CHECKSUM;
 2331                 cp = g_new_consumer(gp);
 2332                 error = g_attach(cp, pp);
 2333                 KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 2334                     pp->name, error));
 2335                 error = g_access(cp, 1, 1, 1);
 2336                 if (error != 0) {
 2337                         GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
 2338                             error);
 2339                         g_journal_destroy(sc);
 2340                         return (NULL);
 2341                 }
 2342                 sc->sc_dconsumer = cp;
 2343                 sc->sc_mediasize = pp->mediasize - pp->sectorsize;
 2344                 sc->sc_sectorsize = pp->sectorsize;
 2345                 sc->sc_jstart = md->md_jstart;
 2346                 sc->sc_jend = md->md_jend;
 2347                 if (md->md_provider[0] != '\0')
 2348                         sc->sc_flags |= GJF_DEVICE_HARDCODED;
 2349                 sc->sc_journal_offset = md->md_joffset;
 2350                 sc->sc_journal_id = md->md_jid;
 2351                 sc->sc_journal_previous_id = md->md_jid;
 2352         }
 2353         if (md->md_type & GJ_TYPE_JOURNAL) {
 2354                 if (cp == NULL) {
 2355                         cp = g_new_consumer(gp);
 2356                         error = g_attach(cp, pp);
 2357                         KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 2358                             pp->name, error));
 2359                         error = g_access(cp, 1, 1, 1);
 2360                         if (error != 0) {
 2361                                 GJ_DEBUG(0, "Cannot access %s (error=%d).",
 2362                                     pp->name, error);
 2363                                 g_journal_destroy(sc);
 2364                                 return (NULL);
 2365                         }
 2366                 } else {
 2367                         /*
 2368                          * Journal is on the same provider as data, which means
 2369                          * that data provider ends where journal starts.
 2370                          */
 2371                         sc->sc_mediasize = md->md_jstart;
 2372                 }
 2373                 sc->sc_jconsumer = cp;
 2374         }
 2375 
 2376         if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
 2377                 /* Journal is not complete yet. */
 2378                 return (gp);
 2379         } else {
 2380                 /* Journal complete, cancel timeout. */
 2381                 callout_drain(&sc->sc_callout);
 2382         }
 2383 
 2384         error = kthread_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
 2385             "g_journal %s", sc->sc_name);
 2386         if (error != 0) {
 2387                 GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
 2388                     sc->sc_name);
 2389                 g_journal_destroy(sc);
 2390                 return (NULL);
 2391         }
 2392 
 2393         return (gp);
 2394 }
 2395 
 2396 static void
 2397 g_journal_destroy_consumer(void *arg, int flags __unused)
 2398 {
 2399         struct g_consumer *cp;
 2400 
 2401         g_topology_assert();
 2402         cp = arg;
 2403         g_detach(cp);
 2404         g_destroy_consumer(cp);
 2405 }
 2406 
 2407 static int
 2408 g_journal_destroy(struct g_journal_softc *sc)
 2409 {
 2410         struct g_geom *gp;
 2411         struct g_provider *pp;
 2412         struct g_consumer *cp;
 2413 
 2414         g_topology_assert();
 2415 
 2416         if (sc == NULL)
 2417                 return (ENXIO);
 2418 
 2419         gp = sc->sc_geom;
 2420         pp = LIST_FIRST(&gp->provider);
 2421         if (pp != NULL) {
 2422                 if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 2423                         GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 2424                             pp->name, pp->acr, pp->acw, pp->ace);
 2425                         return (EBUSY);
 2426                 }
 2427                 g_error_provider(pp, ENXIO);
 2428 
 2429                 g_journal_flush(sc);
 2430                 g_journal_flush_send(sc);
 2431                 g_journal_switch(sc);
 2432         }
 2433 
 2434         sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
 2435 
 2436         g_topology_unlock();
 2437         callout_drain(&sc->sc_callout);
 2438         mtx_lock(&sc->sc_mtx);
 2439         wakeup(sc);
 2440         while (sc->sc_worker != NULL)
 2441                 msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
 2442         mtx_unlock(&sc->sc_mtx);
 2443 
 2444         if (pp != NULL) {
 2445                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 2446                 g_journal_metadata_update(sc);
 2447                 g_topology_lock();
 2448                 pp->flags |= G_PF_WITHER;
 2449                 g_orphan_provider(pp, ENXIO);
 2450         } else {
 2451                 g_topology_lock();
 2452         }
 2453         mtx_destroy(&sc->sc_mtx);
 2454 
 2455         if (sc->sc_current_count != 0) {
 2456                 GJ_DEBUG(0, "Warning! Number of current requests %d.",
 2457                     sc->sc_current_count);
 2458         }
 2459 
 2460         LIST_FOREACH(cp, &gp->consumer, consumer) {
 2461                 if (cp->acr + cp->acw + cp->ace > 0)
 2462                         g_access(cp, -1, -1, -1);
 2463                 /*
 2464                  * We keep all consumers open for writting, so if I'll detach
 2465                  * and destroy consumer here, I'll get providers for taste, so
 2466                  * journal will be started again.
 2467                  * Sending an event here, prevents this from happening.
 2468                  */
 2469                 g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
 2470         }
 2471         gp->softc = NULL;
 2472         g_wither_geom(gp, ENXIO);
 2473         free(sc, M_JOURNAL);
 2474         return (0);
 2475 }
 2476 
 2477 static void
 2478 g_journal_taste_orphan(struct g_consumer *cp)
 2479 {
 2480 
 2481         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 2482             cp->provider->name));
 2483 }
 2484 
 2485 static struct g_geom *
 2486 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 2487 {
 2488         struct g_journal_metadata md;
 2489         struct g_consumer *cp;
 2490         struct g_geom *gp;
 2491         int error;
 2492 
 2493         g_topology_assert();
 2494         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 2495         GJ_DEBUG(2, "Tasting %s.", pp->name);
 2496         if (pp->geom->class == mp)
 2497                 return (NULL);
 2498 
 2499         gp = g_new_geomf(mp, "journal:taste");
 2500         /* This orphan function should be never called. */
 2501         gp->orphan = g_journal_taste_orphan;
 2502         cp = g_new_consumer(gp);
 2503         g_attach(cp, pp);
 2504         error = g_journal_metadata_read(cp, &md);
 2505         g_detach(cp);
 2506         g_destroy_consumer(cp);
 2507         g_destroy_geom(gp);
 2508         if (error != 0)
 2509                 return (NULL);
 2510         gp = NULL;
 2511 
 2512         if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
 2513                 return (NULL);
 2514         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 2515                 return (NULL);
 2516         if (g_journal_debug >= 2)
 2517                 journal_metadata_dump(&md);
 2518 
 2519         gp = g_journal_create(mp, pp, &md);
 2520         return (gp);
 2521 }
 2522 
 2523 static struct g_journal_softc *
 2524 g_journal_find_device(struct g_class *mp, const char *name)
 2525 {
 2526         struct g_journal_softc *sc;
 2527         struct g_geom *gp;
 2528         struct g_provider *pp;
 2529 
 2530         if (strncmp(name, "/dev/", 5) == 0)
 2531                 name += 5;
 2532         LIST_FOREACH(gp, &mp->geom, geom) {
 2533                 sc = gp->softc;
 2534                 if (sc == NULL)
 2535                         continue;
 2536                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
 2537                         continue;
 2538                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 2539                         continue;
 2540                 pp = LIST_FIRST(&gp->provider);
 2541                 if (strcmp(sc->sc_name, name) == 0)
 2542                         return (sc);
 2543                 if (pp != NULL && strcmp(pp->name, name) == 0)
 2544                         return (sc);
 2545         }
 2546         return (NULL);
 2547 }
 2548 
 2549 static void
 2550 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 2551 {
 2552         struct g_journal_softc *sc;
 2553         const char *name;
 2554         char param[16];
 2555         int *nargs;
 2556         int error, i;
 2557 
 2558         g_topology_assert();
 2559 
 2560         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 2561         if (nargs == NULL) {
 2562                 gctl_error(req, "No '%s' argument.", "nargs");
 2563                 return;
 2564         }
 2565         if (*nargs <= 0) {
 2566                 gctl_error(req, "Missing device(s).");
 2567                 return;
 2568         }
 2569 
 2570         for (i = 0; i < *nargs; i++) {
 2571                 snprintf(param, sizeof(param), "arg%d", i);
 2572                 name = gctl_get_asciiparam(req, param);
 2573                 if (name == NULL) {
 2574                         gctl_error(req, "No 'arg%d' argument.", i);
 2575                         return;
 2576                 }
 2577                 sc = g_journal_find_device(mp, name);
 2578                 if (sc == NULL) {
 2579                         gctl_error(req, "No such device: %s.", name);
 2580                         return;
 2581                 }
 2582                 error = g_journal_destroy(sc);
 2583                 if (error != 0) {
 2584                         gctl_error(req, "Cannot destroy device %s (error=%d).",
 2585                             LIST_FIRST(&sc->sc_geom->provider)->name, error);
 2586                         return;
 2587                 }
 2588         }
 2589 }
 2590 
 2591 static void
 2592 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
 2593 {
 2594 
 2595         g_topology_assert();
 2596         g_topology_unlock();
 2597         g_journal_sync_requested++;
 2598         wakeup(&g_journal_switcher_state);
 2599         while (g_journal_sync_requested > 0)
 2600                 tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
 2601         g_topology_lock();
 2602 }
 2603 
 2604 static void
 2605 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 2606 {
 2607         uint32_t *version;
 2608 
 2609         g_topology_assert();
 2610 
 2611         version = gctl_get_paraml(req, "version", sizeof(*version));
 2612         if (version == NULL) {
 2613                 gctl_error(req, "No '%s' argument.", "version");
 2614                 return;
 2615         }
 2616         if (*version != G_JOURNAL_VERSION) {
 2617                 gctl_error(req, "Userland and kernel parts are out of sync.");
 2618                 return;
 2619         }
 2620 
 2621         if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
 2622                 g_journal_ctl_destroy(req, mp);
 2623                 return;
 2624         } else if (strcmp(verb, "sync") == 0) {
 2625                 g_journal_ctl_sync(req, mp);
 2626                 return;
 2627         }
 2628 
 2629         gctl_error(req, "Unknown verb.");
 2630 }
 2631 
 2632 static void
 2633 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2634     struct g_consumer *cp, struct g_provider *pp)
 2635 {
 2636         struct g_journal_softc *sc;
 2637 
 2638         g_topology_assert();
 2639 
 2640         sc = gp->softc;
 2641         if (sc == NULL)
 2642                 return;
 2643         if (pp != NULL) {
 2644                 /* Nothing here. */
 2645         } else if (cp != NULL) {
 2646                 int first = 1;
 2647 
 2648                 sbuf_printf(sb, "%s<Role>", indent);
 2649                 if (cp == sc->sc_dconsumer) {
 2650                         sbuf_printf(sb, "Data");
 2651                         first = 0;
 2652                 }
 2653                 if (cp == sc->sc_jconsumer) {
 2654                         if (!first)
 2655                                 sbuf_printf(sb, ",");
 2656                         sbuf_printf(sb, "Journal");
 2657                 }
 2658                 sbuf_printf(sb, "</Role>\n");
 2659                 if (cp == sc->sc_jconsumer) {
 2660                         sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
 2661                             (intmax_t)sc->sc_jstart);
 2662                         sbuf_printf(sb, "<Jend>%jd</Jend>\n",
 2663                             (intmax_t)sc->sc_jend);
 2664                 }
 2665         } else {
 2666                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 2667         }
 2668 }
 2669 
 2670 static eventhandler_tag g_journal_event_shutdown = NULL;
 2671 static eventhandler_tag g_journal_event_lowmem = NULL;
 2672 
 2673 static void
 2674 g_journal_shutdown(void *arg, int howto __unused)
 2675 {
 2676         struct g_class *mp;
 2677         struct g_geom *gp, *gp2;
 2678 
 2679         if (panicstr != NULL)
 2680                 return;
 2681         mp = arg;
 2682         DROP_GIANT();
 2683         g_topology_lock();
 2684         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 2685                 if (gp->softc == NULL)
 2686                         continue;
 2687                 GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
 2688                 g_journal_destroy(gp->softc);
 2689         }
 2690         g_topology_unlock();
 2691         PICKUP_GIANT();
 2692 }
 2693 
 2694 /*
 2695  * Free cached requests from inactive queue in case of low memory.
 2696  * We free GJ_FREE_AT_ONCE elements at once.
 2697  */
 2698 #define GJ_FREE_AT_ONCE 4
 2699 static void
 2700 g_journal_lowmem(void *arg, int howto __unused)
 2701 {
 2702         struct g_journal_softc *sc;
 2703         struct g_class *mp;
 2704         struct g_geom *gp;
 2705         struct bio *bp;
 2706         u_int nfree = GJ_FREE_AT_ONCE;
 2707 
 2708         g_journal_stats_low_mem++;
 2709         mp = arg;
 2710         DROP_GIANT();
 2711         g_topology_lock();
 2712         LIST_FOREACH(gp, &mp->geom, geom) {
 2713                 sc = gp->softc;
 2714                 if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
 2715                         continue;
 2716                 mtx_lock(&sc->sc_mtx);
 2717                 for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
 2718                     nfree--, bp = bp->bio_next) {
 2719                         /*
 2720                          * This is safe to free the bio_data, because:
 2721                          * 1. If bio_data is NULL it will be read from the
 2722                          *    inactive journal.
 2723                          * 2. If bp is sent down, it is first removed from the
 2724                          *    inactive queue, so it's impossible to free the
 2725                          *    data from under in-flight bio.
 2726                          * On the other hand, freeing elements from the active
 2727                          * queue, is not safe.
 2728                          */
 2729                         if (bp->bio_data != NULL) {
 2730                                 GJ_DEBUG(2, "Freeing data from %s.",
 2731                                     sc->sc_name);
 2732                                 gj_free(bp->bio_data, bp->bio_length);
 2733                                 bp->bio_data = NULL;
 2734                         }
 2735                 }
 2736                 mtx_unlock(&sc->sc_mtx);
 2737                 if (nfree == 0)
 2738                         break;
 2739         }
 2740         g_topology_unlock();
 2741         PICKUP_GIANT();
 2742 }
 2743 
 2744 static void g_journal_switcher(void *arg);
 2745 
 2746 static void
 2747 g_journal_init(struct g_class *mp)
 2748 {
 2749         int error;
 2750 
 2751         /* Pick a conservative value if provided value sucks. */
 2752         if (g_journal_cache_divisor <= 0 ||
 2753             (vm_kmem_size / g_journal_cache_divisor == 0)) {
 2754                 g_journal_cache_divisor = 5;
 2755         }
 2756         if (g_journal_cache_limit > 0) {
 2757                 g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
 2758                 g_journal_cache_low =
 2759                     (g_journal_cache_limit / 100) * g_journal_cache_switch;
 2760         }
 2761         g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
 2762             g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
 2763         if (g_journal_event_shutdown == NULL)
 2764                 GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
 2765         g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 2766             g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
 2767         if (g_journal_event_lowmem == NULL)
 2768                 GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
 2769         error = kthread_create(g_journal_switcher, mp, NULL, 0, 0,
 2770             "g_journal switcher");
 2771         KASSERT(error == 0, ("Cannot create switcher thread."));
 2772 }
 2773 
 2774 static void
 2775 g_journal_fini(struct g_class *mp)
 2776 {
 2777 
 2778         if (g_journal_event_shutdown != NULL) {
 2779                 EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 2780                     g_journal_event_shutdown);
 2781         }
 2782         if (g_journal_event_lowmem != NULL)
 2783                 EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
 2784         g_journal_switcher_state = GJ_SWITCHER_DIE;
 2785         wakeup(&g_journal_switcher_state);
 2786         while (g_journal_switcher_state != GJ_SWITCHER_DIED)
 2787                 tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
 2788         GJ_DEBUG(1, "Switcher died.");
 2789 }
 2790 
 2791 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
 2792 
 2793 static const struct g_journal_desc *
 2794 g_journal_find_desc(const char *fstype)
 2795 {
 2796         const struct g_journal_desc *desc;
 2797         int i;
 2798 
 2799         for (desc = g_journal_filesystems[i = 0]; desc != NULL;
 2800              desc = g_journal_filesystems[++i]) {
 2801                 if (strcmp(desc->jd_fstype, fstype) == 0)
 2802                         break;
 2803         }
 2804         return (desc);
 2805 }
 2806 
 2807 static void
 2808 g_journal_switch_wait(struct g_journal_softc *sc)
 2809 {
 2810         struct bintime bt;
 2811 
 2812         mtx_assert(&sc->sc_mtx, MA_OWNED);
 2813         if (g_journal_debug >= 2) {
 2814                 if (sc->sc_flush_in_progress > 0) {
 2815                         GJ_DEBUG(2, "%d requests flushing.",
 2816                             sc->sc_flush_in_progress);
 2817                 }
 2818                 if (sc->sc_copy_in_progress > 0) {
 2819                         GJ_DEBUG(2, "%d requests copying.",
 2820                             sc->sc_copy_in_progress);
 2821                 }
 2822                 if (sc->sc_flush_count > 0) {
 2823                         GJ_DEBUG(2, "%d requests to flush.",
 2824                             sc->sc_flush_count);
 2825                 }
 2826                 if (sc->sc_delayed_count > 0) {
 2827                         GJ_DEBUG(2, "%d requests delayed.",
 2828                             sc->sc_delayed_count);
 2829                 }
 2830         }
 2831         g_journal_stats_switches++;
 2832         if (sc->sc_copy_in_progress > 0)
 2833                 g_journal_stats_wait_for_copy++;
 2834         GJ_TIMER_START(1, &bt);
 2835         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 2836         sc->sc_flags |= GJF_DEVICE_SWITCH;
 2837         wakeup(sc);
 2838         while (sc->sc_flags & GJF_DEVICE_SWITCH) {
 2839                 msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
 2840                     "gj:switch", 0);
 2841         }
 2842         GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
 2843 }
 2844 
 2845 static void
 2846 g_journal_do_switch(struct g_class *classp, struct thread *td)
 2847 {
 2848         struct g_journal_softc *sc;
 2849         const struct g_journal_desc *desc;
 2850         struct g_geom *gp;
 2851         struct mount *mp;
 2852         struct bintime bt;
 2853         char *mountpoint;
 2854         int error, vfslocked;
 2855 
 2856         DROP_GIANT();
 2857         g_topology_lock();
 2858         LIST_FOREACH(gp, &classp->geom, geom) {
 2859                 sc = gp->softc;
 2860                 if (sc == NULL)
 2861                         continue;
 2862                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
 2863                         continue;
 2864                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 2865                         continue;
 2866                 mtx_lock(&sc->sc_mtx);
 2867                 sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
 2868                 mtx_unlock(&sc->sc_mtx);
 2869         }
 2870         g_topology_unlock();
 2871         PICKUP_GIANT();
 2872 
 2873         mtx_lock(&mountlist_mtx);
 2874         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 2875                 if (mp->mnt_gjprovider == NULL)
 2876                         continue;
 2877                 if (mp->mnt_flag & MNT_RDONLY)
 2878                         continue;
 2879                 desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 2880                 if (desc == NULL)
 2881                         continue;
 2882                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 2883                         continue;
 2884                 /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
 2885 
 2886                 DROP_GIANT();
 2887                 g_topology_lock();
 2888                 sc = g_journal_find_device(classp, mp->mnt_gjprovider);
 2889                 g_topology_unlock();
 2890                 PICKUP_GIANT();
 2891 
 2892                 if (sc == NULL) {
 2893                         GJ_DEBUG(0, "Cannot find journal geom for %s.",
 2894                             mp->mnt_gjprovider);
 2895                         goto next;
 2896                 } else if (JEMPTY(sc)) {
 2897                         mtx_lock(&sc->sc_mtx);
 2898                         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 2899                         mtx_unlock(&sc->sc_mtx);
 2900                         GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 2901                         goto next;
 2902                 }
 2903 
 2904                 mountpoint = mp->mnt_stat.f_mntonname;
 2905 
 2906                 vfslocked = VFS_LOCK_GIANT(mp);
 2907 
 2908                 error = vn_start_write(NULL, &mp, V_WAIT);
 2909                 if (error != 0) {
 2910                         VFS_UNLOCK_GIANT(vfslocked);
 2911                         GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
 2912                             mountpoint, error);
 2913                         goto next;
 2914                 }
 2915 
 2916                 MNT_ILOCK(mp);
 2917                 mp->mnt_noasync++;
 2918                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
 2919                 MNT_IUNLOCK(mp);
 2920 
 2921                 GJ_TIMER_START(1, &bt);
 2922                 vfs_msync(mp, MNT_NOWAIT);
 2923                 GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 2924 
 2925                 GJ_TIMER_START(1, &bt);
 2926                 error = VFS_SYNC(mp, MNT_NOWAIT, curthread);
 2927                 if (error == 0)
 2928                         GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
 2929                 else {
 2930                         GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
 2931                             mountpoint, error);
 2932                 }
 2933 
 2934                 MNT_ILOCK(mp);
 2935                 mp->mnt_noasync--;
 2936                 if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 2937                         mp->mnt_kern_flag |= MNTK_ASYNC;
 2938                 MNT_IUNLOCK(mp);
 2939 
 2940                 vn_finished_write(mp);
 2941 
 2942                 if (error != 0) {
 2943                         VFS_UNLOCK_GIANT(vfslocked);
 2944                         goto next;
 2945                 }
 2946 
 2947                 /*
 2948                  * Send BIO_FLUSH before freezing the file system, so it can be
 2949                  * faster after the freeze.
 2950                  */
 2951                 GJ_TIMER_START(1, &bt);
 2952                 g_journal_flush_cache(sc);
 2953                 GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
 2954 
 2955                 GJ_TIMER_START(1, &bt);
 2956                 error = vfs_write_suspend(mp);
 2957                 VFS_UNLOCK_GIANT(vfslocked);
 2958                 GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
 2959                 if (error != 0) {
 2960                         GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
 2961                             mountpoint, error);
 2962                         goto next;
 2963                 }
 2964 
 2965                 error = desc->jd_clean(mp);
 2966                 if (error != 0)
 2967                         goto next;
 2968 
 2969                 mtx_lock(&sc->sc_mtx);
 2970                 g_journal_switch_wait(sc);
 2971                 mtx_unlock(&sc->sc_mtx);
 2972 
 2973                 vfs_write_resume(mp);
 2974 next:
 2975                 mtx_lock(&mountlist_mtx);
 2976                 vfs_unbusy(mp, td);
 2977         }
 2978         mtx_unlock(&mountlist_mtx);
 2979 
 2980         sc = NULL;
 2981         for (;;) {
 2982                 DROP_GIANT();
 2983                 g_topology_lock();
 2984                 LIST_FOREACH(gp, &g_journal_class.geom, geom) {
 2985                         sc = gp->softc;
 2986                         if (sc == NULL)
 2987                                 continue;
 2988                         mtx_lock(&sc->sc_mtx);
 2989                         if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
 2990                             !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
 2991                             (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
 2992                                 break;
 2993                         }
 2994                         mtx_unlock(&sc->sc_mtx);
 2995                         sc = NULL;
 2996                 }
 2997                 g_topology_unlock();
 2998                 PICKUP_GIANT();
 2999                 if (sc == NULL)
 3000                         break;
 3001                 mtx_assert(&sc->sc_mtx, MA_OWNED);
 3002                 g_journal_switch_wait(sc);
 3003                 mtx_unlock(&sc->sc_mtx);
 3004         }
 3005 }
 3006 
 3007 /*
 3008  * TODO: Switcher thread should be started on first geom creation and killed on
 3009  * last geom destruction.
 3010  */
 3011 static void
 3012 g_journal_switcher(void *arg)
 3013 {
 3014         struct thread *td = curthread;
 3015         struct g_class *mp;
 3016         struct bintime bt;
 3017         int error;
 3018 
 3019         mp = arg;
 3020         for (;;) {
 3021                 g_journal_switcher_wokenup = 0;
 3022                 error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
 3023                     g_journal_switch_time * hz);
 3024                 if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
 3025                         g_journal_switcher_state = GJ_SWITCHER_DIED;
 3026                         GJ_DEBUG(1, "Switcher exiting.");
 3027                         wakeup(&g_journal_switcher_state);
 3028                         kthread_exit(0);
 3029                 }
 3030                 if (error == 0 && g_journal_sync_requested == 0) {
 3031                         GJ_DEBUG(1, "Out of cache, force switch (used=%u "
 3032                             "limit=%u).", g_journal_cache_used,
 3033                             g_journal_cache_limit);
 3034                 }
 3035                 GJ_TIMER_START(1, &bt);
 3036                 g_journal_do_switch(mp, td);
 3037                 GJ_TIMER_STOP(1, &bt, "Entire switch time");
 3038                 if (g_journal_sync_requested > 0) {
 3039                         g_journal_sync_requested = 0;
 3040                         wakeup(&g_journal_sync_requested);
 3041                 }
 3042         }
 3043 }

Cache object: ad235bd00188d049fce0cb2959af50b6


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.