The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfs/nfs_fha.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions
    6  * are met:
    7  * 1. Redistributions of source code must retain the above copyright
    8  *    notice, this list of conditions and the following disclaimer.
    9  * 2. Redistributions in binary form must reproduce the above copyright
   10  *    notice, this list of conditions and the following disclaimer in the
   11  *    documentation and/or other materials provided with the distribution.
   12  *
   13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   23  * SUCH DAMAGE.
   24  */
   25 
   26 #include <sys/cdefs.h>
   27 __FBSDID("$FreeBSD: releng/11.1/sys/nfs/nfs_fha.c 267479 2014-06-14 12:26:12Z mav $");
   28 
   29 #include <sys/param.h>
   30 #include <sys/systm.h>
   31 #include <sys/sysproto.h>
   32 #include <sys/kernel.h>
   33 #include <sys/sysctl.h>
   34 #include <sys/vnode.h>
   35 #include <sys/malloc.h>
   36 #include <sys/mount.h>
   37 #include <sys/mbuf.h>
   38 #include <sys/sbuf.h>
   39 
   40 #include <rpc/rpc.h>
   41 #include <nfs/nfs_fha.h>
   42 
   43 static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
   44 
   45 /*
   46  * XXX need to commonize definitions between old and new NFS code.  Define
   47  * this here so we don't include one nfsproto.h over the other.
   48  */
   49 #define NFS_PROG                100003
   50 
   51 void
   52 fha_init(struct fha_params *softc)
   53 {
   54         char tmpstr[128];
   55         int i;
   56 
   57         for (i = 0; i < FHA_HASH_SIZE; i++)
   58                 mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
   59 
   60         /*
   61          * Set the default tuning parameters.
   62          */
   63         softc->ctls.enable = FHA_DEF_ENABLE;
   64         softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
   65         softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
   66         softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
   67 
   68         /*
   69          * Allow the user to override the defaults at boot time with
   70          * tunables.
   71          */
   72         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
   73             softc->server_name);
   74         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
   75         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
   76             softc->server_name);
   77         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
   78         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
   79             softc->server_name);
   80         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
   81         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
   82             softc->server_name);
   83         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);
   84 
   85         /*
   86          * Add sysctls so the user can change the tuning parameters at
   87          * runtime.
   88          */
   89         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   90             OID_AUTO, "enable", CTLFLAG_RW,
   91             &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
   92 
   93         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   94             OID_AUTO, "bin_shift", CTLFLAG_RW,
   95             &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
   96             "contend if they're 2^(bin_shift) bytes apart");
   97 
   98         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   99             OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
  100             &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
  101             "should be working on requests for the same file handle");
  102 
  103         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  104             OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
  105             &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
  106             "single nfsd thread should be working on at any time");
  107 
  108         SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  109             OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
  110             softc->callbacks.fhe_stats_sysctl, "A", "");
  111 
  112 }
  113 
  114 void
  115 fha_uninit(struct fha_params *softc)
  116 {
  117         int i;
  118 
  119         sysctl_ctx_free(&softc->sysctl_ctx);
  120         for (i = 0; i < FHA_HASH_SIZE; i++)
  121                 mtx_destroy(&softc->fha_hash[i].mtx);
  122 }
  123 
  124 /*
  125  * This just specifies that offsets should obey affinity when within
  126  * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
  127  */
  128 static void
  129 fha_extract_info(struct svc_req *req, struct fha_info *i,
  130     struct fha_callbacks *cb)
  131 {
  132         struct mbuf *md;
  133         caddr_t dpos;
  134         static u_int64_t random_fh = 0;
  135         int error;
  136         int v3 = (req->rq_vers == 3);
  137         rpcproc_t procnum;
  138 
  139         /*
  140          * We start off with a random fh.  If we get a reasonable
  141          * procnum, we set the fh.  If there's a concept of offset
  142          * that we're interested in, we set that.
  143          */
  144         i->fh = ++random_fh;
  145         i->offset = 0;
  146         i->locktype = LK_EXCLUSIVE;
  147 
  148         /*
  149          * Extract the procnum and convert to v3 form if necessary,
  150          * taking care to deal with out-of-range procnums.  Caller will
  151          * ensure that rq_vers is either 2 or 3.
  152          */
  153         procnum = req->rq_proc;
  154         if (!v3) {
  155                 rpcproc_t tmp_procnum;
  156 
  157                 tmp_procnum = cb->get_procnum(procnum);
  158                 if (tmp_procnum == -1)
  159                         goto out;
  160                 procnum = tmp_procnum;
  161         }
  162 
  163         /*
  164          * We do affinity for most.  However, we divide a realm of affinity
  165          * by file offset so as to allow for concurrent random access.  We
  166          * only do this for reads today, but this may change when IFS supports
  167          * efficient concurrent writes.
  168          */
  169         if (cb->no_offset(procnum))
  170                 goto out;
  171 
  172         error = cb->realign(&req->rq_args, M_NOWAIT);
  173         if (error)
  174                 goto out;
  175         md = req->rq_args;
  176         dpos = mtod(md, caddr_t);
  177 
  178         /* Grab the filehandle. */
  179         error = cb->get_fh(&i->fh, v3, &md, &dpos);
  180         if (error)
  181                 goto out;
  182 
  183         /* Content ourselves with zero offset for all but reads. */
  184         if (cb->is_read(procnum) || cb->is_write(procnum))
  185                 cb->get_offset(&md, &dpos, v3, i);
  186 
  187 out:
  188         cb->set_locktype(procnum, i);
  189 }
  190 
  191 static struct fha_hash_entry *
  192 fha_hash_entry_new(u_int64_t fh)
  193 {
  194         struct fha_hash_entry *e;
  195 
  196         e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
  197         e->fh = fh;
  198         e->num_rw = 0;
  199         e->num_exclusive = 0;
  200         e->num_threads = 0;
  201         LIST_INIT(&e->threads);
  202 
  203         return (e);
  204 }
  205 
  206 static void
  207 fha_hash_entry_destroy(struct fha_hash_entry *e)
  208 {
  209 
  210         mtx_assert(e->mtx, MA_OWNED);
  211         KASSERT(e->num_rw == 0,
  212             ("%d reqs on destroyed fhe %p", e->num_rw, e));
  213         KASSERT(e->num_exclusive == 0,
  214             ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
  215         KASSERT(e->num_threads == 0,
  216             ("%d threads on destroyed fhe %p", e->num_threads, e));
  217         free(e, M_NFS_FHA);
  218 }
  219 
  220 static void
  221 fha_hash_entry_remove(struct fha_hash_entry *e)
  222 {
  223 
  224         mtx_assert(e->mtx, MA_OWNED);
  225         LIST_REMOVE(e, link);
  226         fha_hash_entry_destroy(e);
  227 }
  228 
  229 static struct fha_hash_entry *
  230 fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
  231 {
  232         SVCPOOL *pool;
  233         struct fha_hash_slot *fhs;
  234         struct fha_hash_entry *fhe, *new_fhe;
  235 
  236         pool = *softc->pool;
  237         fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
  238         new_fhe = fha_hash_entry_new(fh);
  239         new_fhe->mtx = &fhs->mtx;
  240         mtx_lock(&fhs->mtx);
  241         LIST_FOREACH(fhe, &fhs->list, link)
  242                 if (fhe->fh == fh)
  243                         break;
  244         if (!fhe) {
  245                 fhe = new_fhe;
  246                 LIST_INSERT_HEAD(&fhs->list, fhe, link);
  247         } else
  248                 fha_hash_entry_destroy(new_fhe);
  249         return (fhe);
  250 }
  251 
  252 static void
  253 fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  254 {
  255 
  256         mtx_assert(fhe->mtx, MA_OWNED);
  257         thread->st_p2 = 0;
  258         LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
  259         fhe->num_threads++;
  260 }
  261 
  262 static void
  263 fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  264 {
  265 
  266         mtx_assert(fhe->mtx, MA_OWNED);
  267         KASSERT(thread->st_p2 == 0,
  268             ("%d reqs on removed thread %p", thread->st_p2, thread));
  269         LIST_REMOVE(thread, st_alink);
  270         fhe->num_threads--;
  271 }
  272 
  273 /*
  274  * Account for an ongoing operation associated with this file.
  275  */
  276 static void
  277 fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
  278 {
  279 
  280         mtx_assert(fhe->mtx, MA_OWNED);
  281         if (LK_EXCLUSIVE == locktype)
  282                 fhe->num_exclusive += count;
  283         else
  284                 fhe->num_rw += count;
  285 }
  286 
  287 /*
  288  * Get the service thread currently associated with the fhe that is
  289  * appropriate to handle this operation.
  290  */
  291 static SVCTHREAD *
  292 fha_hash_entry_choose_thread(struct fha_params *softc,
  293     struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
  294 {
  295         SVCTHREAD *thread, *min_thread = NULL;
  296         SVCPOOL *pool;
  297         int req_count, min_count = 0;
  298         off_t offset1, offset2;
  299 
  300         pool = *softc->pool;
  301 
  302         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  303                 req_count = thread->st_p2;
  304 
  305                 /* If there are any writes in progress, use the first thread. */
  306                 if (fhe->num_exclusive) {
  307 #if 0
  308                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  309                             "fha: %p(%d)w", thread, req_count);
  310 #endif
  311                         return (thread);
  312                 }
  313 
  314                 /*
  315                  * Check for read locality, making sure that we won't
  316                  * exceed our per-thread load limit in the process.
  317                  */
  318                 offset1 = i->offset;
  319                 offset2 = thread->st_p3;
  320 
  321                 if (((offset1 >= offset2)
  322                   && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
  323                  || ((offset2 > offset1)
  324                   && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
  325                         if ((softc->ctls.max_reqs_per_nfsd == 0) ||
  326                             (req_count < softc->ctls.max_reqs_per_nfsd)) {
  327 #if 0
  328                                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  329                                     "fha: %p(%d)r", thread, req_count);
  330 #endif
  331                                 return (thread);
  332                         }
  333                 }
  334 
  335                 /*
  336                  * We don't have a locality match, so skip this thread,
  337                  * but keep track of the most attractive thread in case
  338                  * we need to come back to it later.
  339                  */
  340 #if 0
  341                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  342                     "fha: %p(%d)s off1 %llu off2 %llu", thread,
  343                     req_count, offset1, offset2);
  344 #endif
  345                 if ((min_thread == NULL) || (req_count < min_count)) {
  346                         min_count = req_count;
  347                         min_thread = thread;
  348                 }
  349         }
  350 
  351         /*
  352          * We didn't find a good match yet.  See if we can add
  353          * a new thread to this file handle entry's thread list.
  354          */
  355         if ((softc->ctls.max_nfsds_per_fh == 0) ||
  356             (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
  357                 thread = this_thread;
  358 #if 0
  359                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  360                     "fha: %p(%d)t", thread, thread->st_p2);
  361 #endif
  362                 fha_hash_entry_add_thread(fhe, thread);
  363         } else {
  364                 /*
  365                  * We don't want to use any more threads for this file, so
  366                  * go back to the most attractive nfsd we're already using.
  367                  */
  368                 thread = min_thread;
  369         }
  370 
  371         return (thread);
  372 }
  373 
  374 /*
  375  * After getting a request, try to assign it to some thread.  Usually we
  376  * handle it ourselves.
  377  */
  378 SVCTHREAD *
  379 fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
  380     struct fha_params *softc)
  381 {
  382         SVCTHREAD *thread;
  383         struct fha_info i;
  384         struct fha_hash_entry *fhe;
  385         struct fha_callbacks *cb;
  386 
  387         cb = &softc->callbacks;
  388 
  389         /* Check to see whether we're enabled. */
  390         if (softc->ctls.enable == 0)
  391                 goto thist;
  392 
  393         /*
  394          * Only do placement if this is an NFS request.
  395          */
  396         if (req->rq_prog != NFS_PROG)
  397                 goto thist;
  398 
  399         if (req->rq_vers != 2 && req->rq_vers != 3)
  400                 goto thist;
  401 
  402         fha_extract_info(req, &i, cb);
  403 
  404         /*
  405          * We save the offset associated with this request for later
  406          * nfsd matching.
  407          */
  408         fhe = fha_hash_entry_lookup(softc, i.fh);
  409         req->rq_p1 = fhe;
  410         req->rq_p2 = i.locktype;
  411         req->rq_p3 = i.offset;
  412 
  413         /*
  414          * Choose a thread, taking into consideration locality, thread load,
  415          * and the number of threads already working on this file.
  416          */
  417         thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
  418         KASSERT(thread, ("fha_assign: NULL thread!"));
  419         fha_hash_entry_add_op(fhe, i.locktype, 1);
  420         thread->st_p2++;
  421         thread->st_p3 = i.offset;
  422 
  423         /*
  424          * Grab the pool lock here to not let chosen thread go away before
  425          * the new request inserted to its queue while we drop fhe lock.
  426          */
  427         mtx_lock(&thread->st_lock);
  428         mtx_unlock(fhe->mtx);
  429 
  430         return (thread);
  431 thist:
  432         req->rq_p1 = NULL;
  433         mtx_lock(&this_thread->st_lock);
  434         return (this_thread);
  435 }
  436 
  437 /*
  438  * Called when we're done with an operation.  The request has already
  439  * been de-queued.
  440  */
  441 void
  442 fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
  443 {
  444         struct fha_hash_entry *fhe = req->rq_p1;
  445         struct mtx *mtx;
  446 
  447         /*
  448          * This may be called for reqs that didn't go through
  449          * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
  450          */
  451         if (!fhe)
  452                 return;
  453 
  454         mtx = fhe->mtx;
  455         mtx_lock(mtx);
  456         fha_hash_entry_add_op(fhe, req->rq_p2, -1);
  457         thread->st_p2--;
  458         KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
  459             thread->st_p2, thread));
  460         if (thread->st_p2 == 0) {
  461                 fha_hash_entry_remove_thread(fhe, thread);
  462                 if (0 == fhe->num_rw + fhe->num_exclusive)
  463                         fha_hash_entry_remove(fhe);
  464         }
  465         mtx_unlock(mtx);
  466 }
  467 
  468 int
  469 fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
  470 {
  471         int error, i;
  472         struct sbuf sb;
  473         struct fha_hash_entry *fhe;
  474         bool_t first, hfirst;
  475         SVCTHREAD *thread;
  476         SVCPOOL *pool;
  477 
  478         sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
  479 
  480         pool = NULL;
  481 
  482         if (!*softc->pool) {
  483                 sbuf_printf(&sb, "NFSD not running\n");
  484                 goto out;
  485         }
  486         pool = *softc->pool;
  487 
  488         for (i = 0; i < FHA_HASH_SIZE; i++)
  489                 if (!LIST_EMPTY(&softc->fha_hash[i].list))
  490                         break;
  491 
  492         if (i == FHA_HASH_SIZE) {
  493                 sbuf_printf(&sb, "No file handle entries.\n");
  494                 goto out;
  495         }
  496 
  497         hfirst = TRUE;
  498         for (; i < FHA_HASH_SIZE; i++) {
  499                 mtx_lock(&softc->fha_hash[i].mtx);
  500                 if (LIST_EMPTY(&softc->fha_hash[i].list)) {
  501                         mtx_unlock(&softc->fha_hash[i].mtx);
  502                         continue;
  503                 }
  504                 sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
  505                 first = TRUE;
  506                 LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
  507                         sbuf_printf(&sb, "%sfhe %p: {\n", first ? "  " : ", ", fhe);
  508 
  509                         sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
  510                         sbuf_printf(&sb, "    num_rw/exclusive: %d/%d\n",
  511                             fhe->num_rw, fhe->num_exclusive);
  512                         sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
  513 
  514                         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  515                                 sbuf_printf(&sb, "      thread %p offset %ju "
  516                                     "reqs %d\n", thread,
  517                                     thread->st_p3, thread->st_p2);
  518                         }
  519 
  520                         sbuf_printf(&sb, "  }");
  521                         first = FALSE;
  522                 }
  523                 sbuf_printf(&sb, "\n}");
  524                 mtx_unlock(&softc->fha_hash[i].mtx);
  525                 hfirst = FALSE;
  526         }
  527 
  528  out:
  529         sbuf_trim(&sb);
  530         sbuf_finish(&sb);
  531         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
  532         sbuf_delete(&sb);
  533         return (error);
  534 }

Cache object: 06237d23939719ab0bcde5590deebdd1


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.