The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfs/nfs_fha.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions
    6  * are met:
    7  * 1. Redistributions of source code must retain the above copyright
    8  *    notice, this list of conditions and the following disclaimer.
    9  * 2. Redistributions in binary form must reproduce the above copyright
   10  *    notice, this list of conditions and the following disclaimer in the
   11  *    documentation and/or other materials provided with the distribution.
   12  *
   13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   23  * SUCH DAMAGE.
   24  */
   25 
   26 #include <sys/cdefs.h>
   27 __FBSDID("$FreeBSD$");
   28 
   29 #include <sys/param.h>
   30 #include <sys/systm.h>
   31 #include <sys/sysproto.h>
   32 #include <sys/kernel.h>
   33 #include <sys/sysctl.h>
   34 #include <sys/vnode.h>
   35 #include <sys/malloc.h>
   36 #include <sys/mount.h>
   37 #include <sys/mbuf.h>
   38 #include <sys/sbuf.h>
   39 
   40 #include <rpc/rpc.h>
   41 #include <nfs/nfs_fha.h>
   42 
   43 static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
   44 
   45 /*
   46  * XXX need to commonize definitions between old and new NFS code.  Define
   47  * this here so we don't include one nfsproto.h over the other.
   48  */
   49 #define NFS_PROG                100003
   50 
   51 void
   52 fha_init(struct fha_params *softc)
   53 {
   54         int i;
   55 
   56         for (i = 0; i < FHA_HASH_SIZE; i++)
   57                 mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
   58 
   59         /*
   60          * Set the default tuning parameters.
   61          */
   62         softc->ctls.enable = FHA_DEF_ENABLE;
   63         softc->ctls.read = FHA_DEF_READ;
   64         softc->ctls.write = FHA_DEF_WRITE;
   65         softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
   66         softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
   67         softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
   68 
   69         /*
   70          * Add sysctls so the user can change the tuning parameters.
   71          */
   72         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   73             OID_AUTO, "enable", CTLFLAG_RWTUN,
   74             &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
   75 
   76         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   77             OID_AUTO, "read", CTLFLAG_RWTUN,
   78             &softc->ctls.read, 0, "Enable NFS FHA read locality");
   79 
   80         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   81             OID_AUTO, "write", CTLFLAG_RWTUN,
   82             &softc->ctls.write, 0, "Enable NFS FHA write locality");
   83 
   84         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   85             OID_AUTO, "bin_shift", CTLFLAG_RWTUN,
   86             &softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes");
   87 
   88         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   89             OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN,
   90             &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
   91             "should be working on requests for the same file handle");
   92 
   93         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   94             OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN,
   95             &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
   96             "single nfsd thread should be working on at any time");
   97 
   98         SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   99             OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
  100             softc->callbacks.fhe_stats_sysctl, "A", "");
  101 
  102 }
  103 
  104 void
  105 fha_uninit(struct fha_params *softc)
  106 {
  107         int i;
  108 
  109         sysctl_ctx_free(&softc->sysctl_ctx);
  110         for (i = 0; i < FHA_HASH_SIZE; i++)
  111                 mtx_destroy(&softc->fha_hash[i].mtx);
  112 }
  113 
  114 /*
  115  * This just specifies that offsets should obey affinity when within
  116  * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
  117  */
  118 static void
  119 fha_extract_info(struct svc_req *req, struct fha_info *i,
  120     struct fha_callbacks *cb)
  121 {
  122         struct mbuf *md;
  123         caddr_t dpos;
  124         static u_int64_t random_fh = 0;
  125         int error;
  126         int v3 = (req->rq_vers == 3);
  127         rpcproc_t procnum;
  128 
  129         /*
  130          * We start off with a random fh.  If we get a reasonable
  131          * procnum, we set the fh.  If there's a concept of offset
  132          * that we're interested in, we set that.
  133          */
  134         i->fh = ++random_fh;
  135         i->offset = 0;
  136         i->locktype = LK_EXCLUSIVE;
  137         i->read = i->write = 0;
  138 
  139         /*
  140          * Extract the procnum and convert to v3 form if necessary,
  141          * taking care to deal with out-of-range procnums.  Caller will
  142          * ensure that rq_vers is either 2 or 3.
  143          */
  144         procnum = req->rq_proc;
  145         if (!v3) {
  146                 rpcproc_t tmp_procnum;
  147 
  148                 tmp_procnum = cb->get_procnum(procnum);
  149                 if (tmp_procnum == -1)
  150                         goto out;
  151                 procnum = tmp_procnum;
  152         }
  153 
  154         /*
  155          * We do affinity for most.  However, we divide a realm of affinity
  156          * by file offset so as to allow for concurrent random access.  We
  157          * only do this for reads today, but this may change when IFS supports
  158          * efficient concurrent writes.
  159          */
  160         if (cb->no_offset(procnum))
  161                 goto out;
  162 
  163         i->read = cb->is_read(procnum);
  164         i->write = cb->is_write(procnum);
  165 
  166         error = cb->realign(&req->rq_args, M_NOWAIT);
  167         if (error)
  168                 goto out;
  169         md = req->rq_args;
  170         dpos = mtod(md, caddr_t);
  171 
  172         /* Grab the filehandle. */
  173         error = cb->get_fh(&i->fh, v3, &md, &dpos);
  174         if (error)
  175                 goto out;
  176 
  177         /* Content ourselves with zero offset for all but reads. */
  178         if (i->read || i->write)
  179                 cb->get_offset(&md, &dpos, v3, i);
  180 
  181 out:
  182         cb->set_locktype(procnum, i);
  183 }
  184 
  185 static struct fha_hash_entry *
  186 fha_hash_entry_new(u_int64_t fh)
  187 {
  188         struct fha_hash_entry *e;
  189 
  190         e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
  191         e->fh = fh;
  192         e->num_rw = 0;
  193         e->num_exclusive = 0;
  194         e->num_threads = 0;
  195         LIST_INIT(&e->threads);
  196 
  197         return (e);
  198 }
  199 
  200 static void
  201 fha_hash_entry_destroy(struct fha_hash_entry *e)
  202 {
  203 
  204         mtx_assert(e->mtx, MA_OWNED);
  205         KASSERT(e->num_rw == 0,
  206             ("%d reqs on destroyed fhe %p", e->num_rw, e));
  207         KASSERT(e->num_exclusive == 0,
  208             ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
  209         KASSERT(e->num_threads == 0,
  210             ("%d threads on destroyed fhe %p", e->num_threads, e));
  211         free(e, M_NFS_FHA);
  212 }
  213 
  214 static void
  215 fha_hash_entry_remove(struct fha_hash_entry *e)
  216 {
  217 
  218         mtx_assert(e->mtx, MA_OWNED);
  219         LIST_REMOVE(e, link);
  220         fha_hash_entry_destroy(e);
  221 }
  222 
  223 static struct fha_hash_entry *
  224 fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
  225 {
  226         SVCPOOL *pool;
  227         struct fha_hash_slot *fhs;
  228         struct fha_hash_entry *fhe, *new_fhe;
  229 
  230         pool = *softc->pool;
  231         fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
  232         new_fhe = fha_hash_entry_new(fh);
  233         new_fhe->mtx = &fhs->mtx;
  234         mtx_lock(&fhs->mtx);
  235         LIST_FOREACH(fhe, &fhs->list, link)
  236                 if (fhe->fh == fh)
  237                         break;
  238         if (!fhe) {
  239                 fhe = new_fhe;
  240                 LIST_INSERT_HEAD(&fhs->list, fhe, link);
  241         } else
  242                 fha_hash_entry_destroy(new_fhe);
  243         return (fhe);
  244 }
  245 
  246 static void
  247 fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  248 {
  249 
  250         mtx_assert(fhe->mtx, MA_OWNED);
  251         thread->st_p2 = 0;
  252         LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
  253         fhe->num_threads++;
  254 }
  255 
  256 static void
  257 fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  258 {
  259 
  260         mtx_assert(fhe->mtx, MA_OWNED);
  261         KASSERT(thread->st_p2 == 0,
  262             ("%d reqs on removed thread %p", thread->st_p2, thread));
  263         LIST_REMOVE(thread, st_alink);
  264         fhe->num_threads--;
  265 }
  266 
  267 /*
  268  * Account for an ongoing operation associated with this file.
  269  */
  270 static void
  271 fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
  272 {
  273 
  274         mtx_assert(fhe->mtx, MA_OWNED);
  275         if (LK_EXCLUSIVE == locktype)
  276                 fhe->num_exclusive += count;
  277         else
  278                 fhe->num_rw += count;
  279 }
  280 
  281 /*
  282  * Get the service thread currently associated with the fhe that is
  283  * appropriate to handle this operation.
  284  */
  285 static SVCTHREAD *
  286 fha_hash_entry_choose_thread(struct fha_params *softc,
  287     struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
  288 {
  289         SVCTHREAD *thread, *min_thread = NULL;
  290         SVCPOOL *pool;
  291         int req_count, min_count = 0;
  292         off_t offset1, offset2;
  293 
  294         pool = *softc->pool;
  295 
  296         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  297                 req_count = thread->st_p2;
  298 
  299                 /* If there are any writes in progress, use the first thread. */
  300                 if (fhe->num_exclusive) {
  301 #if 0
  302                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  303                             "fha: %p(%d)w", thread, req_count);
  304 #endif
  305                         return (thread);
  306                 }
  307 
  308                 /* Check whether we should consider locality. */
  309                 if ((i->read && !softc->ctls.read) ||
  310                     (i->write && !softc->ctls.write))
  311                         goto noloc;
  312 
  313                 /*
  314                  * Check for locality, making sure that we won't
  315                  * exceed our per-thread load limit in the process.
  316                  */
  317                 offset1 = i->offset;
  318                 offset2 = thread->st_p3;
  319 
  320                 if (((offset1 >= offset2)
  321                   && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
  322                  || ((offset2 > offset1)
  323                   && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
  324                         if ((softc->ctls.max_reqs_per_nfsd == 0) ||
  325                             (req_count < softc->ctls.max_reqs_per_nfsd)) {
  326 #if 0
  327                                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  328                                     "fha: %p(%d)r", thread, req_count);
  329 #endif
  330                                 return (thread);
  331                         }
  332                 }
  333 
  334 noloc:
  335                 /*
  336                  * We don't have a locality match, so skip this thread,
  337                  * but keep track of the most attractive thread in case
  338                  * we need to come back to it later.
  339                  */
  340 #if 0
  341                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  342                     "fha: %p(%d)s off1 %llu off2 %llu", thread,
  343                     req_count, offset1, offset2);
  344 #endif
  345                 if ((min_thread == NULL) || (req_count < min_count)) {
  346                         min_count = req_count;
  347                         min_thread = thread;
  348                 }
  349         }
  350 
  351         /*
  352          * We didn't find a good match yet.  See if we can add
  353          * a new thread to this file handle entry's thread list.
  354          */
  355         if ((softc->ctls.max_nfsds_per_fh == 0) ||
  356             (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
  357                 thread = this_thread;
  358 #if 0
  359                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  360                     "fha: %p(%d)t", thread, thread->st_p2);
  361 #endif
  362                 fha_hash_entry_add_thread(fhe, thread);
  363         } else {
  364                 /*
  365                  * We don't want to use any more threads for this file, so
  366                  * go back to the most attractive nfsd we're already using.
  367                  */
  368                 thread = min_thread;
  369         }
  370 
  371         return (thread);
  372 }
  373 
  374 /*
  375  * After getting a request, try to assign it to some thread.  Usually we
  376  * handle it ourselves.
  377  */
  378 SVCTHREAD *
  379 fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
  380     struct fha_params *softc)
  381 {
  382         SVCTHREAD *thread;
  383         struct fha_info i;
  384         struct fha_hash_entry *fhe;
  385         struct fha_callbacks *cb;
  386 
  387         cb = &softc->callbacks;
  388 
  389         /* Check to see whether we're enabled. */
  390         if (softc->ctls.enable == 0)
  391                 goto thist;
  392 
  393         /*
  394          * Only do placement if this is an NFS request.
  395          */
  396         if (req->rq_prog != NFS_PROG)
  397                 goto thist;
  398 
  399         if (req->rq_vers != 2 && req->rq_vers != 3)
  400                 goto thist;
  401 
  402         fha_extract_info(req, &i, cb);
  403 
  404         /*
  405          * We save the offset associated with this request for later
  406          * nfsd matching.
  407          */
  408         fhe = fha_hash_entry_lookup(softc, i.fh);
  409         req->rq_p1 = fhe;
  410         req->rq_p2 = i.locktype;
  411         req->rq_p3 = i.offset;
  412 
  413         /*
  414          * Choose a thread, taking into consideration locality, thread load,
  415          * and the number of threads already working on this file.
  416          */
  417         thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
  418         KASSERT(thread, ("fha_assign: NULL thread!"));
  419         fha_hash_entry_add_op(fhe, i.locktype, 1);
  420         thread->st_p2++;
  421         thread->st_p3 = i.offset;
  422 
  423         /*
  424          * Grab the pool lock here to not let chosen thread go away before
  425          * the new request inserted to its queue while we drop fhe lock.
  426          */
  427         mtx_lock(&thread->st_lock);
  428         mtx_unlock(fhe->mtx);
  429 
  430         return (thread);
  431 thist:
  432         req->rq_p1 = NULL;
  433         mtx_lock(&this_thread->st_lock);
  434         return (this_thread);
  435 }
  436 
  437 /*
  438  * Called when we're done with an operation.  The request has already
  439  * been de-queued.
  440  */
  441 void
  442 fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
  443 {
  444         struct fha_hash_entry *fhe = req->rq_p1;
  445         struct mtx *mtx;
  446 
  447         /*
  448          * This may be called for reqs that didn't go through
  449          * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
  450          */
  451         if (!fhe)
  452                 return;
  453 
  454         mtx = fhe->mtx;
  455         mtx_lock(mtx);
  456         fha_hash_entry_add_op(fhe, req->rq_p2, -1);
  457         thread->st_p2--;
  458         KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
  459             thread->st_p2, thread));
  460         if (thread->st_p2 == 0) {
  461                 fha_hash_entry_remove_thread(fhe, thread);
  462                 if (0 == fhe->num_rw + fhe->num_exclusive)
  463                         fha_hash_entry_remove(fhe);
  464         }
  465         mtx_unlock(mtx);
  466 }
  467 
  468 int
  469 fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
  470 {
  471         int error, i;
  472         struct sbuf sb;
  473         struct fha_hash_entry *fhe;
  474         bool_t first, hfirst;
  475         SVCTHREAD *thread;
  476         SVCPOOL *pool;
  477 
  478         sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
  479 
  480         pool = NULL;
  481 
  482         if (!*softc->pool) {
  483                 sbuf_printf(&sb, "NFSD not running\n");
  484                 goto out;
  485         }
  486         pool = *softc->pool;
  487 
  488         for (i = 0; i < FHA_HASH_SIZE; i++)
  489                 if (!LIST_EMPTY(&softc->fha_hash[i].list))
  490                         break;
  491 
  492         if (i == FHA_HASH_SIZE) {
  493                 sbuf_printf(&sb, "No file handle entries.\n");
  494                 goto out;
  495         }
  496 
  497         hfirst = TRUE;
  498         for (; i < FHA_HASH_SIZE; i++) {
  499                 mtx_lock(&softc->fha_hash[i].mtx);
  500                 if (LIST_EMPTY(&softc->fha_hash[i].list)) {
  501                         mtx_unlock(&softc->fha_hash[i].mtx);
  502                         continue;
  503                 }
  504                 sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
  505                 first = TRUE;
  506                 LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
  507                         sbuf_printf(&sb, "%sfhe %p: {\n", first ? "  " : ", ", fhe);
  508 
  509                         sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
  510                         sbuf_printf(&sb, "    num_rw/exclusive: %d/%d\n",
  511                             fhe->num_rw, fhe->num_exclusive);
  512                         sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
  513 
  514                         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  515                                 sbuf_printf(&sb, "      thread %p offset %ju "
  516                                     "reqs %d\n", thread,
  517                                     thread->st_p3, thread->st_p2);
  518                         }
  519 
  520                         sbuf_printf(&sb, "  }");
  521                         first = FALSE;
  522                 }
  523                 sbuf_printf(&sb, "\n}");
  524                 mtx_unlock(&softc->fha_hash[i].mtx);
  525                 hfirst = FALSE;
  526         }
  527 
  528  out:
  529         sbuf_trim(&sb);
  530         sbuf_finish(&sb);
  531         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
  532         sbuf_delete(&sb);
  533         return (error);
  534 }

Cache object: 6a805ffa75a4074c4fb97b88d7a567d3


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.