The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfs/nfs_fha.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  */
   27 
   28 #include <sys/cdefs.h>
   29 __FBSDID("$FreeBSD: releng/12.0/sys/nfs/nfs_fha.c 327173 2017-12-25 04:48:39Z kan $");
   30 
   31 #include <sys/param.h>
   32 #include <sys/systm.h>
   33 #include <sys/sysproto.h>
   34 #include <sys/kernel.h>
   35 #include <sys/sysctl.h>
   36 #include <sys/vnode.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mount.h>
   39 #include <sys/mbuf.h>
   40 #include <sys/sbuf.h>
   41 
   42 #include <rpc/rpc.h>
   43 #include <nfs/nfs_fha.h>
   44 
   45 static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
   46 
   47 /*
   48  * XXX need to commonize definitions between old and new NFS code.  Define
   49  * this here so we don't include one nfsproto.h over the other.
   50  */
   51 #define NFS_PROG                100003
   52 
   53 void
   54 fha_init(struct fha_params *softc)
   55 {
   56         int i;
   57 
   58         for (i = 0; i < FHA_HASH_SIZE; i++)
   59                 mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
   60 
   61         /*
   62          * Set the default tuning parameters.
   63          */
   64         softc->ctls.enable = FHA_DEF_ENABLE;
   65         softc->ctls.read = FHA_DEF_READ;
   66         softc->ctls.write = FHA_DEF_WRITE;
   67         softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
   68         softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
   69         softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
   70 
   71         /*
   72          * Add sysctls so the user can change the tuning parameters.
   73          */
   74         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   75             OID_AUTO, "enable", CTLFLAG_RWTUN,
   76             &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
   77 
   78         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   79             OID_AUTO, "read", CTLFLAG_RWTUN,
   80             &softc->ctls.read, 0, "Enable NFS FHA read locality");
   81 
   82         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   83             OID_AUTO, "write", CTLFLAG_RWTUN,
   84             &softc->ctls.write, 0, "Enable NFS FHA write locality");
   85 
   86         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   87             OID_AUTO, "bin_shift", CTLFLAG_RWTUN,
   88             &softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes");
   89 
   90         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   91             OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN,
   92             &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
   93             "should be working on requests for the same file handle");
   94 
   95         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   96             OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN,
   97             &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
   98             "single nfsd thread should be working on at any time");
   99 
  100         SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  101             OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
  102             softc->callbacks.fhe_stats_sysctl, "A", "");
  103 
  104 }
  105 
  106 void
  107 fha_uninit(struct fha_params *softc)
  108 {
  109         int i;
  110 
  111         sysctl_ctx_free(&softc->sysctl_ctx);
  112         for (i = 0; i < FHA_HASH_SIZE; i++)
  113                 mtx_destroy(&softc->fha_hash[i].mtx);
  114 }
  115 
  116 /*
  117  * This just specifies that offsets should obey affinity when within
  118  * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
  119  */
  120 static void
  121 fha_extract_info(struct svc_req *req, struct fha_info *i,
  122     struct fha_callbacks *cb)
  123 {
  124         struct mbuf *md;
  125         caddr_t dpos;
  126         static u_int64_t random_fh = 0;
  127         int error;
  128         int v3 = (req->rq_vers == 3);
  129         rpcproc_t procnum;
  130 
  131         /*
  132          * We start off with a random fh.  If we get a reasonable
  133          * procnum, we set the fh.  If there's a concept of offset
  134          * that we're interested in, we set that.
  135          */
  136         i->fh = ++random_fh;
  137         i->offset = 0;
  138         i->locktype = LK_EXCLUSIVE;
  139         i->read = i->write = 0;
  140 
  141         /*
  142          * Extract the procnum and convert to v3 form if necessary,
  143          * taking care to deal with out-of-range procnums.  Caller will
  144          * ensure that rq_vers is either 2 or 3.
  145          */
  146         procnum = req->rq_proc;
  147         if (!v3) {
  148                 rpcproc_t tmp_procnum;
  149 
  150                 tmp_procnum = cb->get_procnum(procnum);
  151                 if (tmp_procnum == -1)
  152                         goto out;
  153                 procnum = tmp_procnum;
  154         }
  155 
  156         /*
  157          * We do affinity for most.  However, we divide a realm of affinity
  158          * by file offset so as to allow for concurrent random access.  We
  159          * only do this for reads today, but this may change when IFS supports
  160          * efficient concurrent writes.
  161          */
  162         if (cb->no_offset(procnum))
  163                 goto out;
  164 
  165         i->read = cb->is_read(procnum);
  166         i->write = cb->is_write(procnum);
  167 
  168         error = cb->realign(&req->rq_args, M_NOWAIT);
  169         if (error)
  170                 goto out;
  171         md = req->rq_args;
  172         dpos = mtod(md, caddr_t);
  173 
  174         /* Grab the filehandle. */
  175         error = cb->get_fh(&i->fh, v3, &md, &dpos);
  176         if (error)
  177                 goto out;
  178 
  179         /* Content ourselves with zero offset for all but reads. */
  180         if (i->read || i->write)
  181                 cb->get_offset(&md, &dpos, v3, i);
  182 
  183 out:
  184         cb->set_locktype(procnum, i);
  185 }
  186 
  187 static struct fha_hash_entry *
  188 fha_hash_entry_new(u_int64_t fh)
  189 {
  190         struct fha_hash_entry *e;
  191 
  192         e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
  193         e->fh = fh;
  194         e->num_rw = 0;
  195         e->num_exclusive = 0;
  196         e->num_threads = 0;
  197         LIST_INIT(&e->threads);
  198 
  199         return (e);
  200 }
  201 
  202 static void
  203 fha_hash_entry_destroy(struct fha_hash_entry *e)
  204 {
  205 
  206         mtx_assert(e->mtx, MA_OWNED);
  207         KASSERT(e->num_rw == 0,
  208             ("%d reqs on destroyed fhe %p", e->num_rw, e));
  209         KASSERT(e->num_exclusive == 0,
  210             ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
  211         KASSERT(e->num_threads == 0,
  212             ("%d threads on destroyed fhe %p", e->num_threads, e));
  213         free(e, M_NFS_FHA);
  214 }
  215 
  216 static void
  217 fha_hash_entry_remove(struct fha_hash_entry *e)
  218 {
  219 
  220         mtx_assert(e->mtx, MA_OWNED);
  221         LIST_REMOVE(e, link);
  222         fha_hash_entry_destroy(e);
  223 }
  224 
  225 static struct fha_hash_entry *
  226 fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
  227 {
  228         struct fha_hash_slot *fhs;
  229         struct fha_hash_entry *fhe, *new_fhe;
  230 
  231         fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
  232         new_fhe = fha_hash_entry_new(fh);
  233         new_fhe->mtx = &fhs->mtx;
  234         mtx_lock(&fhs->mtx);
  235         LIST_FOREACH(fhe, &fhs->list, link)
  236                 if (fhe->fh == fh)
  237                         break;
  238         if (!fhe) {
  239                 fhe = new_fhe;
  240                 LIST_INSERT_HEAD(&fhs->list, fhe, link);
  241         } else
  242                 fha_hash_entry_destroy(new_fhe);
  243         return (fhe);
  244 }
  245 
  246 static void
  247 fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  248 {
  249 
  250         mtx_assert(fhe->mtx, MA_OWNED);
  251         thread->st_p2 = 0;
  252         LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
  253         fhe->num_threads++;
  254 }
  255 
  256 static void
  257 fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  258 {
  259 
  260         mtx_assert(fhe->mtx, MA_OWNED);
  261         KASSERT(thread->st_p2 == 0,
  262             ("%d reqs on removed thread %p", thread->st_p2, thread));
  263         LIST_REMOVE(thread, st_alink);
  264         fhe->num_threads--;
  265 }
  266 
  267 /*
  268  * Account for an ongoing operation associated with this file.
  269  */
  270 static void
  271 fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
  272 {
  273 
  274         mtx_assert(fhe->mtx, MA_OWNED);
  275         if (LK_EXCLUSIVE == locktype)
  276                 fhe->num_exclusive += count;
  277         else
  278                 fhe->num_rw += count;
  279 }
  280 
  281 /*
  282  * Get the service thread currently associated with the fhe that is
  283  * appropriate to handle this operation.
  284  */
  285 static SVCTHREAD *
  286 fha_hash_entry_choose_thread(struct fha_params *softc,
  287     struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
  288 {
  289         SVCTHREAD *thread, *min_thread = NULL;
  290         int req_count, min_count = 0;
  291         off_t offset1, offset2;
  292 
  293         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  294                 req_count = thread->st_p2;
  295 
  296                 /* If there are any writes in progress, use the first thread. */
  297                 if (fhe->num_exclusive) {
  298 #if 0
  299                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  300                             "fha: %p(%d)w", thread, req_count);
  301 #endif
  302                         return (thread);
  303                 }
  304 
  305                 /* Check whether we should consider locality. */
  306                 if ((i->read && !softc->ctls.read) ||
  307                     (i->write && !softc->ctls.write))
  308                         goto noloc;
  309 
  310                 /*
  311                  * Check for locality, making sure that we won't
  312                  * exceed our per-thread load limit in the process.
  313                  */
  314                 offset1 = i->offset;
  315                 offset2 = thread->st_p3;
  316 
  317                 if (((offset1 >= offset2)
  318                   && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
  319                  || ((offset2 > offset1)
  320                   && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
  321                         if ((softc->ctls.max_reqs_per_nfsd == 0) ||
  322                             (req_count < softc->ctls.max_reqs_per_nfsd)) {
  323 #if 0
  324                                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  325                                     "fha: %p(%d)r", thread, req_count);
  326 #endif
  327                                 return (thread);
  328                         }
  329                 }
  330 
  331 noloc:
  332                 /*
  333                  * We don't have a locality match, so skip this thread,
  334                  * but keep track of the most attractive thread in case
  335                  * we need to come back to it later.
  336                  */
  337 #if 0
  338                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  339                     "fha: %p(%d)s off1 %llu off2 %llu", thread,
  340                     req_count, offset1, offset2);
  341 #endif
  342                 if ((min_thread == NULL) || (req_count < min_count)) {
  343                         min_count = req_count;
  344                         min_thread = thread;
  345                 }
  346         }
  347 
  348         /*
  349          * We didn't find a good match yet.  See if we can add
  350          * a new thread to this file handle entry's thread list.
  351          */
  352         if ((softc->ctls.max_nfsds_per_fh == 0) ||
  353             (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
  354                 thread = this_thread;
  355 #if 0
  356                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  357                     "fha: %p(%d)t", thread, thread->st_p2);
  358 #endif
  359                 fha_hash_entry_add_thread(fhe, thread);
  360         } else {
  361                 /*
  362                  * We don't want to use any more threads for this file, so
  363                  * go back to the most attractive nfsd we're already using.
  364                  */
  365                 thread = min_thread;
  366         }
  367 
  368         return (thread);
  369 }
  370 
  371 /*
  372  * After getting a request, try to assign it to some thread.  Usually we
  373  * handle it ourselves.
  374  */
  375 SVCTHREAD *
  376 fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
  377     struct fha_params *softc)
  378 {
  379         SVCTHREAD *thread;
  380         struct fha_info i;
  381         struct fha_hash_entry *fhe;
  382         struct fha_callbacks *cb;
  383 
  384         cb = &softc->callbacks;
  385 
  386         /* Check to see whether we're enabled. */
  387         if (softc->ctls.enable == 0)
  388                 goto thist;
  389 
  390         /*
  391          * Only do placement if this is an NFS request.
  392          */
  393         if (req->rq_prog != NFS_PROG)
  394                 goto thist;
  395 
  396         if (req->rq_vers != 2 && req->rq_vers != 3)
  397                 goto thist;
  398 
  399         fha_extract_info(req, &i, cb);
  400 
  401         /*
  402          * We save the offset associated with this request for later
  403          * nfsd matching.
  404          */
  405         fhe = fha_hash_entry_lookup(softc, i.fh);
  406         req->rq_p1 = fhe;
  407         req->rq_p2 = i.locktype;
  408         req->rq_p3 = i.offset;
  409 
  410         /*
  411          * Choose a thread, taking into consideration locality, thread load,
  412          * and the number of threads already working on this file.
  413          */
  414         thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
  415         KASSERT(thread, ("fha_assign: NULL thread!"));
  416         fha_hash_entry_add_op(fhe, i.locktype, 1);
  417         thread->st_p2++;
  418         thread->st_p3 = i.offset;
  419 
  420         /*
  421          * Grab the pool lock here to not let chosen thread go away before
  422          * the new request inserted to its queue while we drop fhe lock.
  423          */
  424         mtx_lock(&thread->st_lock);
  425         mtx_unlock(fhe->mtx);
  426 
  427         return (thread);
  428 thist:
  429         req->rq_p1 = NULL;
  430         mtx_lock(&this_thread->st_lock);
  431         return (this_thread);
  432 }
  433 
  434 /*
  435  * Called when we're done with an operation.  The request has already
  436  * been de-queued.
  437  */
  438 void
  439 fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
  440 {
  441         struct fha_hash_entry *fhe = req->rq_p1;
  442         struct mtx *mtx;
  443 
  444         /*
  445          * This may be called for reqs that didn't go through
  446          * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
  447          */
  448         if (!fhe)
  449                 return;
  450 
  451         mtx = fhe->mtx;
  452         mtx_lock(mtx);
  453         fha_hash_entry_add_op(fhe, req->rq_p2, -1);
  454         thread->st_p2--;
  455         KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
  456             thread->st_p2, thread));
  457         if (thread->st_p2 == 0) {
  458                 fha_hash_entry_remove_thread(fhe, thread);
  459                 if (0 == fhe->num_rw + fhe->num_exclusive)
  460                         fha_hash_entry_remove(fhe);
  461         }
  462         mtx_unlock(mtx);
  463 }
  464 
  465 int
  466 fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
  467 {
  468         int error, i;
  469         struct sbuf sb;
  470         struct fha_hash_entry *fhe;
  471         bool_t first, hfirst;
  472         SVCTHREAD *thread;
  473 
  474         sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
  475 
  476         if (!*softc->pool) {
  477                 sbuf_printf(&sb, "NFSD not running\n");
  478                 goto out;
  479         }
  480 
  481         for (i = 0; i < FHA_HASH_SIZE; i++)
  482                 if (!LIST_EMPTY(&softc->fha_hash[i].list))
  483                         break;
  484 
  485         if (i == FHA_HASH_SIZE) {
  486                 sbuf_printf(&sb, "No file handle entries.\n");
  487                 goto out;
  488         }
  489 
  490         hfirst = TRUE;
  491         for (; i < FHA_HASH_SIZE; i++) {
  492                 mtx_lock(&softc->fha_hash[i].mtx);
  493                 if (LIST_EMPTY(&softc->fha_hash[i].list)) {
  494                         mtx_unlock(&softc->fha_hash[i].mtx);
  495                         continue;
  496                 }
  497                 sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
  498                 first = TRUE;
  499                 LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
  500                         sbuf_printf(&sb, "%sfhe %p: {\n", first ? "  " : ", ", fhe);
  501 
  502                         sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
  503                         sbuf_printf(&sb, "    num_rw/exclusive: %d/%d\n",
  504                             fhe->num_rw, fhe->num_exclusive);
  505                         sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
  506 
  507                         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  508                                 sbuf_printf(&sb, "      thread %p offset %ju "
  509                                     "reqs %d\n", thread,
  510                                     thread->st_p3, thread->st_p2);
  511                         }
  512 
  513                         sbuf_printf(&sb, "  }");
  514                         first = FALSE;
  515                 }
  516                 sbuf_printf(&sb, "\n}");
  517                 mtx_unlock(&softc->fha_hash[i].mtx);
  518                 hfirst = FALSE;
  519         }
  520 
  521  out:
  522         sbuf_trim(&sb);
  523         sbuf_finish(&sb);
  524         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
  525         sbuf_delete(&sb);
  526         return (error);
  527 }

Cache object: 7ec9913c37e298f017a758d739900e9a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.