FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_epoch.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  *
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include <sys/param.h>
   33 #include <sys/systm.h>
   34 #include <sys/counter.h>
   35 #include <sys/epoch.h>
   36 #include <sys/gtaskqueue.h>
   37 #include <sys/kernel.h>
   38 #include <sys/limits.h>
   39 #include <sys/lock.h>
   40 #include <sys/malloc.h>
   41 #include <sys/mutex.h>
   42 #include <sys/pcpu.h>
   43 #include <sys/proc.h>
   44 #include <sys/sched.h>
   45 #include <sys/sx.h>
   46 #include <sys/smp.h>
   47 #include <sys/sysctl.h>
   48 #include <sys/turnstile.h>
   49 #ifdef EPOCH_TRACE
   50 #include <machine/stdarg.h>
   51 #include <sys/stack.h>
   52 #include <sys/tree.h>
   53 #endif
   54 #include <vm/vm.h>
   55 #include <vm/vm_extern.h>
   56 #include <vm/vm_kern.h>
   57 #include <vm/uma.h>
   58 
   59 #include <ck_epoch.h>
   60 
   61 #ifdef __amd64__
   62 #define EPOCH_ALIGN CACHE_LINE_SIZE*2
   63 #else
   64 #define EPOCH_ALIGN CACHE_LINE_SIZE
   65 #endif
   66 
   67 TAILQ_HEAD (epoch_tdlist, epoch_tracker);
   68 typedef struct epoch_record {
   69         ck_epoch_record_t er_record;
   70         struct epoch_context er_drain_ctx;
   71         struct epoch *er_parent;
   72         volatile struct epoch_tdlist er_tdlist;
   73         volatile uint32_t er_gen;
   74         uint32_t er_cpuid;
   75 #ifdef INVARIANTS
   76         /* Used to verify record ownership for non-preemptible epochs. */
   77         struct thread *er_td;
   78 #endif
   79 } __aligned(EPOCH_ALIGN)     *epoch_record_t;
   80 
   81 struct epoch {
   82         struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
   83         epoch_record_t e_pcpu_record;
   84         int     e_in_use;
   85         int     e_flags;
   86         struct sx e_drain_sx;
   87         struct mtx e_drain_mtx;
   88         volatile int e_drain_count;
   89         const char *e_name;
   90 };
   91 
   92 /* arbitrary --- needs benchmarking */
   93 #define MAX_ADAPTIVE_SPIN 100
   94 #define MAX_EPOCHS 64
   95 
   96 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
   97 SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
   98     "epoch information");
   99 SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  100     "epoch stats");
  101 
  102 /* Stats. */
  103 static counter_u64_t block_count;
  104 
  105 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
  106     &block_count, "# of times a thread was in an epoch when epoch_wait was called");
  107 static counter_u64_t migrate_count;
  108 
  109 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
  110     &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
  111 static counter_u64_t turnstile_count;
  112 
  113 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
  114     &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
  115 static counter_u64_t switch_count;
  116 
  117 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
  118     &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
  119 static counter_u64_t epoch_call_count;
  120 
  121 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
  122     &epoch_call_count, "# of times a callback was deferred");
  123 static counter_u64_t epoch_call_task_count;
  124 
  125 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
  126     &epoch_call_task_count, "# of times a callback task was run");
  127 
  128 TAILQ_HEAD (threadlist, thread);
  129 
  130 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
  131     ck_epoch_entry_container)
  132 
  133 static struct epoch epoch_array[MAX_EPOCHS];
  134 
  135 DPCPU_DEFINE(struct grouptask, epoch_cb_task);
  136 DPCPU_DEFINE(int, epoch_cb_count);
  137 
  138 static __read_mostly int inited;
  139 __read_mostly epoch_t global_epoch;
  140 __read_mostly epoch_t global_epoch_preempt;
  141 
  142 static void epoch_call_task(void *context __unused);
  143 static  uma_zone_t pcpu_zone_record;
  144 
  145 static struct sx epoch_sx;
  146 
  147 #define EPOCH_LOCK() sx_xlock(&epoch_sx)
  148 #define EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
  149 
  150 #ifdef EPOCH_TRACE
  151 struct stackentry {
  152         RB_ENTRY(stackentry) se_node;
  153         struct stack se_stack;
  154 };
  155 
  156 static int
  157 stackentry_compare(struct stackentry *a, struct stackentry *b)
  158 {
  159 
  160         if (a->se_stack.depth > b->se_stack.depth)
  161                 return (1);
  162         if (a->se_stack.depth < b->se_stack.depth)
  163                 return (-1);
  164         for (int i = 0; i < a->se_stack.depth; i++) {
  165                 if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
  166                         return (1);
  167                 if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
  168                         return (-1);
  169         }
  170 
  171         return (0);
  172 }
  173 
  174 RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
  175 RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);
  176 
  177 static struct mtx epoch_stacks_lock;
  178 MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);
  179 
  180 static bool epoch_trace_stack_print = true;
  181 SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
  182     &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");
  183 
  184 static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
  185 static inline void
  186 epoch_trace_report(const char *fmt, ...)
  187 {
  188         va_list ap;
  189         struct stackentry se, *new;
  190 
  191         stack_zero(&se.se_stack);       /* XXX: is it really needed? */
  192         stack_save(&se.se_stack);
  193 
  194         /* Tree is never reduced - go lockless. */
  195         if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
  196                 return;
  197 
  198         new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
  199         if (new != NULL) {
  200                 bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));
  201 
  202                 mtx_lock(&epoch_stacks_lock);
  203                 new = RB_INSERT(stacktree, &epoch_stacks, new);
  204                 mtx_unlock(&epoch_stacks_lock);
  205                 if (new != NULL)
  206                         free(new, M_STACK);
  207         }
  208 
  209         va_start(ap, fmt);
  210         (void)vprintf(fmt, ap);
  211         va_end(ap);
  212         if (epoch_trace_stack_print)
  213                 stack_print_ddb(&se.se_stack);
  214 }
  215 
  216 static inline void
  217 epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
  218     const char *file, int line)
  219 {
  220         epoch_tracker_t iet;
  221 
  222         SLIST_FOREACH(iet, &td->td_epochs, et_tlink) {
  223                 if (iet->et_epoch != epoch)
  224                         continue;
  225                 epoch_trace_report("Recursively entering epoch %s "
  226                     "at %s:%d, previously entered at %s:%d\n",
  227                     epoch->e_name, file, line,
  228                     iet->et_file, iet->et_line);
  229         }
  230         et->et_epoch = epoch;
  231         et->et_file = file;
  232         et->et_line = line;
  233         SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
  234 }
  235 
  236 static inline void
  237 epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
  238     const char *file, int line)
  239 {
  240 
  241         if (SLIST_FIRST(&td->td_epochs) != et) {
  242                 epoch_trace_report("Exiting epoch %s in a not nested order "
  243                     "at %s:%d. Most recently entered %s at %s:%d\n",
  244                     epoch->e_name,
  245                     file, line,
  246                     SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
  247                     SLIST_FIRST(&td->td_epochs)->et_file,
  248                     SLIST_FIRST(&td->td_epochs)->et_line);
  249                 /* This will panic if et is not anywhere on td_epochs. */
  250                 SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
  251         } else
  252                 SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
  253 }
  254 
  255 /* Used by assertions that check thread state before going to sleep. */
  256 void
  257 epoch_trace_list(struct thread *td)
  258 {
  259         epoch_tracker_t iet;
  260 
  261         SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
  262                 printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
  263                     iet->et_file, iet->et_line);
  264 }
  265 #endif /* EPOCH_TRACE */
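
/*
 * Editor's sketch, not part of the original file: with EPOCH_TRACE
 * enabled, trackers are pushed onto td_epochs and are expected to be
 * popped in LIFO order.  The code below stays silent, whereas swapping
 * the two exit calls would make epoch_trace_exit() report a not-nested
 * exit.  "cfg_epoch", "net_epoch" and the enclosing function are
 * assumptions used only for illustration.
 */
#if 0
static void
example_nested_sections(epoch_t cfg_epoch, epoch_t net_epoch)
{
	struct epoch_tracker et_outer, et_inner;

	epoch_enter_preempt(cfg_epoch, &et_outer);	/* head of td_epochs */
	epoch_enter_preempt(net_epoch, &et_inner);	/* new head */
	/* ... read epoch-protected state ... */
	epoch_exit_preempt(net_epoch, &et_inner);	/* innermost first */
	epoch_exit_preempt(cfg_epoch, &et_outer);
}
#endif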
  266 
  267 static void
  268 epoch_init(void *arg __unused)
  269 {
  270         int cpu;
  271 
  272         block_count = counter_u64_alloc(M_WAITOK);
  273         migrate_count = counter_u64_alloc(M_WAITOK);
  274         turnstile_count = counter_u64_alloc(M_WAITOK);
  275         switch_count = counter_u64_alloc(M_WAITOK);
  276         epoch_call_count = counter_u64_alloc(M_WAITOK);
  277         epoch_call_task_count = counter_u64_alloc(M_WAITOK);
  278 
  279         pcpu_zone_record = uma_zcreate("epoch_record pcpu",
  280             sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
  281             UMA_ALIGN_PTR, UMA_ZONE_PCPU);
  282         CPU_FOREACH(cpu) {
  283                 GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
  284                     epoch_call_task, NULL);
  285                 taskqgroup_attach_cpu(qgroup_softirq,
  286                     DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
  287                     "epoch call task");
  288         }
  289 #ifdef EPOCH_TRACE
  290         SLIST_INIT(&thread0.td_epochs);
  291 #endif
  292         sx_init(&epoch_sx, "epoch-sx");
  293         inited = 1;
  294         global_epoch = epoch_alloc("Global", 0);
  295         global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
  296 }
  297 SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
  298 
  299 #if !defined(EARLY_AP_STARTUP)
  300 static void
  301 epoch_init_smp(void *dummy __unused)
  302 {
  303         inited = 2;
  304 }
  305 SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
  306 #endif
  307 
  308 static void
  309 epoch_ctor(epoch_t epoch)
  310 {
  311         epoch_record_t er;
  312         int cpu;
  313 
  314         epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
  315         CPU_FOREACH(cpu) {
  316                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
  317                 bzero(er, sizeof(*er));
  318                 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
  319                 TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
  320                 er->er_cpuid = cpu;
  321                 er->er_parent = epoch;
  322         }
  323 }
  324 
  325 static void
  326 epoch_adjust_prio(struct thread *td, u_char prio)
  327 {
  328 
  329         thread_lock(td);
  330         sched_prio(td, prio);
  331         thread_unlock(td);
  332 }
  333 
  334 epoch_t
  335 epoch_alloc(const char *name, int flags)
  336 {
  337         epoch_t epoch;
  338         int i;
  339 
  340         MPASS(name != NULL);
  341 
  342         if (__predict_false(!inited))
  343                 panic("%s called too early in boot", __func__);
  344 
  345         EPOCH_LOCK();
  346 
  347         /*
  348          * Find a free index in the epoch array. If no free index is
  349          * found, try to use the index after the last one.
  350          */
  351         for (i = 0;; i++) {
  352                 /*
  353                  * If too many epochs are currently allocated,
  354                  * return NULL.
  355                  */
  356                 if (i == MAX_EPOCHS) {
  357                         epoch = NULL;
  358                         goto done;
  359                 }
  360                 if (epoch_array[i].e_in_use == 0)
  361                         break;
  362         }
  363 
  364         epoch = epoch_array + i;
  365         ck_epoch_init(&epoch->e_epoch);
  366         epoch_ctor(epoch);
  367         epoch->e_flags = flags;
  368         epoch->e_name = name;
  369         sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
  370         mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
  371 
  372         /*
  373          * Set e_in_use last, because when this field is set the
  374          * epoch_call_task() function will start scanning this epoch
  375          * structure.
  376          */
  377         atomic_store_rel_int(&epoch->e_in_use, 1);
  378 done:
  379         EPOCH_UNLOCK();
  380         return (epoch);
  381 }
  382 
  383 void
  384 epoch_free(epoch_t epoch)
  385 {
  386 #ifdef INVARIANTS
  387         int cpu;
  388 #endif
  389 
  390         EPOCH_LOCK();
  391 
  392         MPASS(epoch->e_in_use != 0);
  393 
  394         epoch_drain_callbacks(epoch);
  395 
  396         atomic_store_rel_int(&epoch->e_in_use, 0);
  397         /*
   398          * Make sure the epoch_call_task() function sees e_in_use equal
   399          * to zero by calling epoch_wait() on the global_epoch:
  400          */
  401         epoch_wait(global_epoch);
  402 #ifdef INVARIANTS
  403         CPU_FOREACH(cpu) {
  404                 epoch_record_t er;
  405 
  406                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
  407 
  408                 /*
  409                  * Sanity check: none of the records should be in use anymore.
  410                  * We drained callbacks above and freeing the pcpu records is
  411                  * imminent.
  412                  */
  413                 MPASS(er->er_td == NULL);
  414                 MPASS(TAILQ_EMPTY(&er->er_tdlist));
  415         }
  416 #endif
  417         uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
  418         mtx_destroy(&epoch->e_drain_mtx);
  419         sx_destroy(&epoch->e_drain_sx);
  420         memset(epoch, 0, sizeof(*epoch));
  421 
  422         EPOCH_UNLOCK();
  423 }
  424 
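/*
 * Editor's sketch, not part of the original file: typical lifetime of a
 * subsystem epoch.  epoch_alloc() returns NULL once all MAX_EPOCHS slots
 * are in use, and epoch_free() drains pending callbacks and waits for a
 * grace period before the slot is recycled.  The "foo" identifiers are
 * assumptions used only for illustration.
 */
#if 0
static epoch_t foo_epoch;

static void
foo_init(void)
{
	/* EPOCH_PREEMPT: read sections may be preempted, but may not sleep. */
	foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
	if (foo_epoch == NULL)
		panic("foo: no free epoch slots");
}

static void
foo_fini(void)
{
	epoch_free(foo_epoch);
	foo_epoch = NULL;
}
#endif
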
  425 static epoch_record_t
  426 epoch_currecord(epoch_t epoch)
  427 {
  428 
  429         return (zpcpu_get(epoch->e_pcpu_record));
  430 }
  431 
  432 #define INIT_CHECK(epoch)                                       \
  433         do {                                                    \
  434                 if (__predict_false((epoch) == NULL))           \
  435                         return;                                 \
  436         } while (0)
  437 
  438 void
  439 _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
  440 {
  441         struct epoch_record *er;
  442         struct thread *td;
  443 
  444         MPASS(cold || epoch != NULL);
  445         td = curthread;
  446         MPASS((vm_offset_t)et >= td->td_kstack &&
  447             (vm_offset_t)et + sizeof(struct epoch_tracker) <=
  448             td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
  449 
  450         INIT_CHECK(epoch);
  451         MPASS(epoch->e_flags & EPOCH_PREEMPT);
  452 
  453 #ifdef EPOCH_TRACE
  454         epoch_trace_enter(td, epoch, et, file, line);
  455 #endif
  456         et->et_td = td;
  457         THREAD_NO_SLEEPING();
  458         critical_enter();
  459         sched_pin();
  460         et->et_old_priority = td->td_priority;
  461         er = epoch_currecord(epoch);
  462         /* Record-level tracking is reserved for non-preemptible epochs. */
  463         MPASS(er->er_td == NULL);
  464         TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
  465         ck_epoch_begin(&er->er_record, &et->et_section);
  466         critical_exit();
  467 }
  468 
  469 void
  470 epoch_enter(epoch_t epoch)
  471 {
  472         epoch_record_t er;
  473 
  474         MPASS(cold || epoch != NULL);
  475         INIT_CHECK(epoch);
  476         critical_enter();
  477         er = epoch_currecord(epoch);
  478 #ifdef INVARIANTS
  479         if (er->er_record.active == 0) {
  480                 MPASS(er->er_td == NULL);
  481                 er->er_td = curthread;
  482         } else {
  483                 /* We've recursed, just make sure our accounting isn't wrong. */
  484                 MPASS(er->er_td == curthread);
  485         }
  486 #endif
  487         ck_epoch_begin(&er->er_record, NULL);
  488 }
  489 
  490 void
  491 _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
  492 {
  493         struct epoch_record *er;
  494         struct thread *td;
  495 
  496         INIT_CHECK(epoch);
  497         td = curthread;
  498         critical_enter();
  499         sched_unpin();
  500         THREAD_SLEEPING_OK();
  501         er = epoch_currecord(epoch);
  502         MPASS(epoch->e_flags & EPOCH_PREEMPT);
  503         MPASS(et != NULL);
  504         MPASS(et->et_td == td);
  505 #ifdef INVARIANTS
  506         et->et_td = (void*)0xDEADBEEF;
  507         /* Record-level tracking is reserved for non-preemptible epochs. */
  508         MPASS(er->er_td == NULL);
  509 #endif
  510         ck_epoch_end(&er->er_record, &et->et_section);
  511         TAILQ_REMOVE(&er->er_tdlist, et, et_link);
  512         er->er_gen++;
  513         if (__predict_false(et->et_old_priority != td->td_priority))
  514                 epoch_adjust_prio(td, et->et_old_priority);
  515         critical_exit();
  516 #ifdef EPOCH_TRACE
  517         epoch_trace_exit(td, epoch, et, file, line);
  518 #endif
  519 }
  520 
  521 void
  522 epoch_exit(epoch_t epoch)
  523 {
  524         epoch_record_t er;
  525 
  526         INIT_CHECK(epoch);
  527         er = epoch_currecord(epoch);
  528         ck_epoch_end(&er->er_record, NULL);
  529 #ifdef INVARIANTS
  530         MPASS(er->er_td == curthread);
  531         if (er->er_record.active == 0)
  532                 er->er_td = NULL;
  533 #endif
  534         critical_exit();
  535 }
  536 
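/*
 * Editor's sketch, not part of the original file: reader-side usage of
 * both flavors.  A preemptible epoch needs a stack-allocated tracker and
 * pins the thread for the duration of the section; the non-preemptible
 * variant runs the section inside a critical section instead.
 * "foo_epoch", the CK list (<sys/ck.h>), foo_inspect() and foo_peek()
 * are assumptions used only for illustration.
 */
#if 0
static void
example_readers(void)
{
	struct epoch_tracker et;
	struct foo *f;

	/* Preemptible read section (epoch allocated with EPOCH_PREEMPT). */
	epoch_enter_preempt(foo_epoch, &et);
	CK_LIST_FOREACH(f, &foo_list, f_link)
		foo_inspect(f);		/* list is stable for the section */
	epoch_exit_preempt(foo_epoch, &et);

	/* Non-preemptible read section: keep it short and never sleep. */
	epoch_enter(global_epoch);
	foo_peek();
	epoch_exit(global_epoch);
}
#endif
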
  537 /*
   538  * epoch_block_handler_preempt() is invoked by the CK code when another
   539  * thread is currently in an epoch section.
  540  */
  541 static void
  542 epoch_block_handler_preempt(struct ck_epoch *global __unused,
  543     ck_epoch_record_t *cr, void *arg __unused)
  544 {
  545         epoch_record_t record;
  546         struct thread *td, *owner, *curwaittd;
  547         struct epoch_tracker *tdwait;
  548         struct turnstile *ts;
  549         struct lock_object *lock;
  550         int spincount, gen;
  551         int locksheld __unused;
  552 
  553         record = __containerof(cr, struct epoch_record, er_record);
  554         td = curthread;
  555         locksheld = td->td_locks;
  556         spincount = 0;
  557         counter_u64_add(block_count, 1);
  558         /*
   559          * We lost a race and there are no longer any threads
  560          * on the CPU in an epoch section.
  561          */
  562         if (TAILQ_EMPTY(&record->er_tdlist))
  563                 return;
  564 
  565         if (record->er_cpuid != curcpu) {
  566                 /*
  567                  * If the head of the list is running, we can wait for it
  568                  * to remove itself from the list and thus save us the
  569                  * overhead of a migration
  570                  */
  571                 gen = record->er_gen;
  572                 thread_unlock(td);
  573                 /*
  574                  * We can't actually check if the waiting thread is running
  575                  * so we simply poll for it to exit before giving up and
  576                  * migrating.
  577                  */
  578                 do {
  579                         cpu_spinwait();
  580                 } while (!TAILQ_EMPTY(&record->er_tdlist) &&
  581                                  gen == record->er_gen &&
  582                                  spincount++ < MAX_ADAPTIVE_SPIN);
  583                 thread_lock(td);
  584                 /*
   585                  * If the generation has changed we can poll again;
   586                  * otherwise we need to migrate.
  587                  */
  588                 if (gen != record->er_gen)
  589                         return;
  590                 /*
   591                  * Running on the same CPU as the record we need to
   592                  * wait on gives us access to the thread list
   593                  * associated with that CPU. We can then examine the
   594                  * oldest thread in the queue and wait on its turnstile
   595                  * until it resumes, and so on, until a grace period
   596                  * elapses.
  597                  *
  598                  */
  599                 counter_u64_add(migrate_count, 1);
  600                 sched_bind(td, record->er_cpuid);
  601                 /*
  602                  * At this point we need to return to the ck code
  603                  * to scan to see if a grace period has elapsed.
  604                  * We can't move on to check the thread list, because
  605                  * in the meantime new threads may have arrived that
  606                  * in fact belong to a different epoch.
  607                  */
  608                 return;
  609         }
  610         /*
  611          * Try to find a thread in an epoch section on this CPU
   612          * waiting on a turnstile. Otherwise lend our priority to any
   613          * lower priority (higher prio value) waiters in the section
   614          * so that they are able to run.
  615          */
  616         TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
  617                 /*
  618                  * Propagate our priority to any other waiters to prevent us
  619                  * from starving them. They will have their original priority
   620                  * restored on exit from epoch_wait().
  621                  */
  622                 curwaittd = tdwait->et_td;
  623                 if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
  624                         critical_enter();
  625                         thread_unlock(td);
  626                         thread_lock(curwaittd);
  627                         sched_prio(curwaittd, td->td_priority);
  628                         thread_unlock(curwaittd);
  629                         thread_lock(td);
  630                         critical_exit();
  631                 }
  632                 if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
  633                     ((ts = curwaittd->td_blocked) != NULL)) {
   634                          * We unlock td to allow turnstile_wait() to
   635                          * reacquire the thread lock. Before unlocking it
   636                          * we enter a critical section so that dropping
   637                          * the thread lock, which re-enables preemption,
   638                          * does not give curwaittd a chance to run here.
  639                          * order to prevent curwaittd from getting to run.
  640                          */
  641                         critical_enter();
  642                         thread_unlock(td);
  643 
  644                         if (turnstile_lock(ts, &lock, &owner)) {
  645                                 if (ts == curwaittd->td_blocked) {
  646                                         MPASS(TD_IS_INHIBITED(curwaittd) &&
  647                                             TD_ON_LOCK(curwaittd));
  648                                         critical_exit();
  649                                         turnstile_wait(ts, owner,
  650                                             curwaittd->td_tsqueue);
  651                                         counter_u64_add(turnstile_count, 1);
  652                                         thread_lock(td);
  653                                         return;
  654                                 }
  655                                 turnstile_unlock(ts, lock);
  656                         }
  657                         thread_lock(td);
  658                         critical_exit();
  659                         KASSERT(td->td_locks == locksheld,
  660                             ("%d extra locks held", td->td_locks - locksheld));
  661                 }
  662         }
  663         /*
  664          * We didn't find any threads actually blocked on a lock
  665          * so we have nothing to do except context switch away.
  666          */
  667         counter_u64_add(switch_count, 1);
  668         mi_switch(SW_VOL | SWT_RELINQUISH);
  669         /*
   670          * It is important that the thread lock is dropped while yielding
   671          * to allow other threads to acquire the lock pointed to by
   672          * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
   673          * thread lock before returning. Otherwise a deadlock-like
   674          * situation might arise.
  675          */
  676         thread_lock(td);
  677 }
  678 
  679 void
  680 epoch_wait_preempt(epoch_t epoch)
  681 {
  682         struct thread *td;
  683         int was_bound;
  684         int old_cpu;
  685         int old_pinned;
  686         u_char old_prio;
  687         int locks __unused;
  688 
  689         MPASS(cold || epoch != NULL);
  690         INIT_CHECK(epoch);
  691         td = curthread;
  692 #ifdef INVARIANTS
  693         locks = curthread->td_locks;
  694         MPASS(epoch->e_flags & EPOCH_PREEMPT);
  695         if ((epoch->e_flags & EPOCH_LOCKED) == 0)
  696                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
  697                     "epoch_wait() can be long running");
  698         KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
  699             "of an epoch section of the same epoch"));
  700 #endif
  701         DROP_GIANT();
  702         thread_lock(td);
  703 
  704         old_cpu = PCPU_GET(cpuid);
  705         old_pinned = td->td_pinned;
  706         old_prio = td->td_priority;
  707         was_bound = sched_is_bound(td);
  708         sched_unbind(td);
  709         td->td_pinned = 0;
  710         sched_bind(td, old_cpu);
  711 
  712         ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
  713             NULL);
  714 
  715         /* restore CPU binding, if any */
  716         if (was_bound != 0) {
  717                 sched_bind(td, old_cpu);
  718         } else {
  719                 /* get thread back to initial CPU, if any */
  720                 if (old_pinned != 0)
  721                         sched_bind(td, old_cpu);
  722                 sched_unbind(td);
  723         }
  724         /* restore pinned after bind */
  725         td->td_pinned = old_pinned;
  726 
  727         /* restore thread priority */
  728         sched_prio(td, old_prio);
  729         thread_unlock(td);
  730         PICKUP_GIANT();
  731         KASSERT(td->td_locks == locks,
  732             ("%d residual locks held", td->td_locks - locks));
  733 }
  734 
  735 static void
  736 epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
  737     void *arg __unused)
  738 {
  739         cpu_spinwait();
  740 }
  741 
  742 void
  743 epoch_wait(epoch_t epoch)
  744 {
  745 
  746         MPASS(cold || epoch != NULL);
  747         INIT_CHECK(epoch);
  748         MPASS(epoch->e_flags == 0);
  749         critical_enter();
  750         ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
  751         critical_exit();
  752 }
  753 
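/*
 * Editor's sketch, not part of the original file: synchronous writer-side
 * reclamation.  The object is first unpublished under its lock, then
 * epoch_wait_preempt() blocks until every reader that might still see it
 * has left its section.  "foo_list_mtx", the CK list linkage and M_FOO
 * are assumptions used only for illustration.
 */
#if 0
static void
foo_remove_sync(struct foo *f)
{

	mtx_lock(&foo_list_mtx);
	CK_LIST_REMOVE(f, f_link);	/* no new readers can find 'f' */
	mtx_unlock(&foo_list_mtx);

	/* May context switch; no locks may be held across this call. */
	epoch_wait_preempt(foo_epoch);
	free(f, M_FOO);
}
#endif
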
  754 void
  755 epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
  756 {
  757         epoch_record_t er;
  758         ck_epoch_entry_t *cb;
  759 
  760         cb = (void *)ctx;
  761 
  762         MPASS(callback);
  763         /* too early in boot to have epoch set up */
  764         if (__predict_false(epoch == NULL))
  765                 goto boottime;
  766 #if !defined(EARLY_AP_STARTUP)
  767         if (__predict_false(inited < 2))
  768                 goto boottime;
  769 #endif
  770 
  771         critical_enter();
  772         *DPCPU_PTR(epoch_cb_count) += 1;
  773         er = epoch_currecord(epoch);
  774         ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
  775         critical_exit();
  776         return;
  777 boottime:
  778         callback(ctx);
  779 }
  780 
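/*
 * Editor's sketch, not part of the original file: deferred reclamation
 * with epoch_call().  The epoch_context is embedded in the object and
 * mapped back with __containerof() in the callback, the same pattern
 * epoch_drain_cb() uses further below.  "struct foo" and M_FOO are
 * assumptions used only for illustration.
 */
#if 0
struct foo {
	uint32_t		f_id;
	CK_LIST_ENTRY(foo)	f_link;
	struct epoch_context	f_epoch_ctx;
};

static void
foo_free_deferred(epoch_context_t ctx)
{
	struct foo *f;

	f = __containerof(ctx, struct foo, f_epoch_ctx);
	free(f, M_FOO);
}

static void
foo_retire(struct foo *f)
{
	/* 'f' must already be unpublished from all reader-visible lists. */
	epoch_call(foo_epoch, foo_free_deferred, &f->f_epoch_ctx);
}
#endif
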
  781 static void
  782 epoch_call_task(void *arg __unused)
  783 {
  784         ck_stack_entry_t *cursor, *head, *next;
  785         ck_epoch_record_t *record;
  786         epoch_record_t er;
  787         epoch_t epoch;
  788         ck_stack_t cb_stack;
  789         int i, npending, total;
  790 
  791         ck_stack_init(&cb_stack);
  792         critical_enter();
  793         epoch_enter(global_epoch);
  794         for (total = i = 0; i != MAX_EPOCHS; i++) {
  795                 epoch = epoch_array + i;
  796                 if (__predict_false(
  797                     atomic_load_acq_int(&epoch->e_in_use) == 0))
  798                         continue;
  799                 er = epoch_currecord(epoch);
  800                 record = &er->er_record;
  801                 if ((npending = record->n_pending) == 0)
  802                         continue;
  803                 ck_epoch_poll_deferred(record, &cb_stack);
  804                 total += npending - record->n_pending;
  805         }
  806         epoch_exit(global_epoch);
  807         *DPCPU_PTR(epoch_cb_count) -= total;
  808         critical_exit();
  809 
  810         counter_u64_add(epoch_call_count, total);
  811         counter_u64_add(epoch_call_task_count, 1);
  812 
  813         head = ck_stack_batch_pop_npsc(&cb_stack);
  814         for (cursor = head; cursor != NULL; cursor = next) {
  815                 struct ck_epoch_entry *entry =
  816                     ck_epoch_entry_container(cursor);
  817 
  818                 next = CK_STACK_NEXT(cursor);
  819                 entry->function(entry);
  820         }
  821 }
  822 
  823 static int
  824 in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
  825 {
  826         epoch_record_t er;
  827         struct epoch_tracker *tdwait;
  828         struct thread *td;
  829 
  830         MPASS(epoch != NULL);
  831         MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
  832         td = curthread;
  833         if (THREAD_CAN_SLEEP())
  834                 return (0);
  835         critical_enter();
  836         er = epoch_currecord(epoch);
  837         TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
  838                 if (tdwait->et_td == td) {
  839                         critical_exit();
  840                         return (1);
  841                 }
  842 #ifdef INVARIANTS
  843         if (dump_onfail) {
  844                 MPASS(td->td_pinned);
  845                 printf("cpu: %d id: %d\n", curcpu, td->td_tid);
  846                 TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
  847                         printf("td_tid: %d ", tdwait->et_td->td_tid);
  848                 printf("\n");
  849         }
  850 #endif
  851         critical_exit();
  852         return (0);
  853 }
  854 
  855 #ifdef INVARIANTS
  856 static void
  857 epoch_assert_nocpu(epoch_t epoch, struct thread *td)
  858 {
  859         epoch_record_t er;
  860         int cpu;
  861         bool crit;
  862 
  863         crit = td->td_critnest > 0;
  864 
  865         /* Check for a critical section mishap. */
  866         CPU_FOREACH(cpu) {
  867                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
  868                 KASSERT(er->er_td != td,
  869                     ("%s critical section in epoch '%s', from cpu %d",
  870                     (crit ? "exited" : "re-entered"), epoch->e_name, cpu));
  871         }
  872 }
  873 #else
  874 #define epoch_assert_nocpu(e, td) do {} while (0)
  875 #endif
  876 
  877 int
  878 in_epoch_verbose(epoch_t epoch, int dump_onfail)
  879 {
  880         epoch_record_t er;
  881         struct thread *td;
  882 
  883         if (__predict_false((epoch) == NULL))
  884                 return (0);
  885         if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
  886                 return (in_epoch_verbose_preempt(epoch, dump_onfail));
  887 
  888         /*
  889          * The thread being in a critical section is a necessary
  890          * condition to be correctly inside a non-preemptible epoch,
  891          * so it's definitely not in this epoch.
  892          */
  893         td = curthread;
  894         if (td->td_critnest == 0) {
  895                 epoch_assert_nocpu(epoch, td);
  896                 return (0);
  897         }
  898 
  899         /*
  900          * The current cpu is in a critical section, so the epoch record will be
  901          * stable for the rest of this function.  Knowing that the record is not
  902          * active is sufficient for knowing whether we're in this epoch or not,
  903          * since it's a pcpu record.
  904          */
  905         er = epoch_currecord(epoch);
  906         if (er->er_record.active == 0) {
  907                 epoch_assert_nocpu(epoch, td);
  908                 return (0);
  909         }
  910 
  911         MPASS(er->er_td == td);
  912         return (1);
  913 }
  914 
  915 int
  916 in_epoch(epoch_t epoch)
  917 {
  918         return (in_epoch_verbose(epoch, 0));
  919 }
  920 
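/*
 * Editor's sketch, not part of the original file: in_epoch() is mostly
 * useful in assertions that document a function's locking requirements.
 * The lookup routine, list and "foo_epoch" are assumptions used only for
 * illustration.
 */
#if 0
static struct foo *
foo_lookup(uint32_t id)
{
	struct foo *f;

	MPASS(in_epoch(foo_epoch));	/* caller must be in a read section */
	CK_LIST_FOREACH(f, &foo_list, f_link)
		if (f->f_id == id)
			return (f);
	return (NULL);
}
#endif
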
  921 static void
  922 epoch_drain_cb(struct epoch_context *ctx)
  923 {
  924         struct epoch *epoch =
  925             __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
  926 
  927         if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
  928                 mtx_lock(&epoch->e_drain_mtx);
  929                 wakeup(epoch);
  930                 mtx_unlock(&epoch->e_drain_mtx);
  931         }
  932 }
  933 
  934 void
  935 epoch_drain_callbacks(epoch_t epoch)
  936 {
  937         epoch_record_t er;
  938         struct thread *td;
  939         int was_bound;
  940         int old_pinned;
  941         int old_cpu;
  942         int cpu;
  943 
  944         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
  945             "epoch_drain_callbacks() may sleep!");
  946 
  947         /* too early in boot to have epoch set up */
  948         if (__predict_false(epoch == NULL))
  949                 return;
  950 #if !defined(EARLY_AP_STARTUP)
  951         if (__predict_false(inited < 2))
  952                 return;
  953 #endif
  954         DROP_GIANT();
  955 
  956         sx_xlock(&epoch->e_drain_sx);
  957         mtx_lock(&epoch->e_drain_mtx);
  958 
  959         td = curthread;
  960         thread_lock(td);
  961         old_cpu = PCPU_GET(cpuid);
  962         old_pinned = td->td_pinned;
  963         was_bound = sched_is_bound(td);
  964         sched_unbind(td);
  965         td->td_pinned = 0;
  966 
  967         CPU_FOREACH(cpu)
  968                 epoch->e_drain_count++;
  969         CPU_FOREACH(cpu) {
  970                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
  971                 sched_bind(td, cpu);
  972                 epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
  973         }
  974 
  975         /* restore CPU binding, if any */
  976         if (was_bound != 0) {
  977                 sched_bind(td, old_cpu);
  978         } else {
  979                 /* get thread back to initial CPU, if any */
  980                 if (old_pinned != 0)
  981                         sched_bind(td, old_cpu);
  982                 sched_unbind(td);
  983         }
  984         /* restore pinned after bind */
  985         td->td_pinned = old_pinned;
  986 
  987         thread_unlock(td);
  988 
  989         while (epoch->e_drain_count != 0)
  990                 msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
  991 
  992         mtx_unlock(&epoch->e_drain_mtx);
  993         sx_xunlock(&epoch->e_drain_sx);
  994 
  995         PICKUP_GIANT();
  996 }
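
/*
 * Editor's sketch, not part of the original file: a module detach path
 * that must guarantee all callbacks scheduled with epoch_call() have run
 * before the module text and its UMA zone go away.  All "foo" identifiers
 * are assumptions used only for illustration.
 */
#if 0
static int
foo_modevent(module_t mod __unused, int type, void *data __unused)
{

	switch (type) {
	case MOD_LOAD:
		foo_init();
		return (0);
	case MOD_UNLOAD:
		foo_unpublish_all();		  /* stop exposing objects */
		epoch_drain_callbacks(foo_epoch); /* pending callbacks done */
		uma_zdestroy(foo_zone);		  /* backing storage can go */
		epoch_free(foo_epoch);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}
#endif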

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.