The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfs/nfs_fha.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions
    6  * are met:
    7  * 1. Redistributions of source code must retain the above copyright
    8  *    notice, this list of conditions and the following disclaimer.
    9  * 2. Redistributions in binary form must reproduce the above copyright
   10  *    notice, this list of conditions and the following disclaimer in the
   11  *    documentation and/or other materials provided with the distribution.
   12  *
   13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   23  * SUCH DAMAGE.
   24  */
   25 
   26 #include <sys/cdefs.h>
   27 __FBSDID("$FreeBSD: releng/9.2/sys/nfs/nfs_fha.c 251641 2013-06-11 23:19:02Z ken $");
   28 
   29 #include <sys/param.h>
   30 #include <sys/systm.h>
   31 #include <sys/sysproto.h>
   32 #include <sys/kernel.h>
   33 #include <sys/sysctl.h>
   34 #include <sys/vnode.h>
   35 #include <sys/malloc.h>
   36 #include <sys/mount.h>
   37 #include <sys/mbuf.h>
   38 #include <sys/sbuf.h>
   39 
   40 #include <rpc/rpc.h>
   41 #include <nfs/nfs_fha.h>
   42 
   43 static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
   44 
   45 /*
   46  * XXX need to commonize definitions between old and new NFS code.  Define
   47  * this here so we don't include one nfsproto.h over the other.
   48  */
   49 #define NFS_PROG                100003
   50 
   51 void
   52 fha_init(struct fha_params *softc)
   53 {
   54         char tmpstr[128];
   55 
   56         /*
   57          * A small hash table to map filehandles to fha_hash_entry
   58          * structures.
   59          */
   60         softc->g_fha.hashtable = hashinit(256, M_NFS_FHA,
   61             &softc->g_fha.hashmask);
   62 
   63         /*
   64          * Set the default tuning parameters.
   65          */
   66         softc->ctls.enable = FHA_DEF_ENABLE;
   67         softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
   68         softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
   69         softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
   70 
   71         /*
   72          * Allow the user to override the defaults at boot time with
   73          * tunables.
   74          */
   75         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
   76             softc->server_name);
   77         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
   78         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
   79             softc->server_name);
   80         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
   81         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
   82             softc->server_name);
   83         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
   84         snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
   85             softc->server_name);
   86         TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);
   87 
   88         /*
   89          * Add sysctls so the user can change the tuning parameters at
   90          * runtime.
   91          */
   92         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   93             OID_AUTO, "enable", CTLFLAG_RW,
   94             &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
   95 
   96         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
   97             OID_AUTO, "bin_shift", CTLFLAG_RW,
   98             &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
   99             "contend if they're 2^(bin_shift) bytes apart");
  100 
  101         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  102             OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
  103             &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
  104             "should be working on requests for the same file handle");
  105 
  106         SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  107             OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
  108             &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
  109             "single nfsd thread should be working on at any time");
  110 
  111         SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
  112             OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
  113             softc->callbacks.fhe_stats_sysctl, "A", "");
  114 
  115 }
  116 
  117 void
  118 fha_uninit(struct fha_params *softc)
  119 {
  120         sysctl_ctx_free(&softc->sysctl_ctx);
  121         hashdestroy(softc->g_fha.hashtable, M_NFS_FHA, softc->g_fha.hashmask);
  122 }
  123 
  124 /*
  125  * This just specifies that offsets should obey affinity when within
  126  * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
  127  */
  128 static void
  129 fha_extract_info(struct svc_req *req, struct fha_info *i,
  130     struct fha_callbacks *cb)
  131 {
  132         struct mbuf *md;
  133         fhandle_t fh;
  134         caddr_t dpos;
  135         static u_int64_t random_fh = 0;
  136         int error;
  137         int v3 = (req->rq_vers == 3);
  138         rpcproc_t procnum;
  139 
  140         /*
  141          * We start off with a random fh.  If we get a reasonable
  142          * procnum, we set the fh.  If there's a concept of offset
  143          * that we're interested in, we set that.
  144          */
  145         i->fh = ++random_fh;
  146         i->offset = 0;
  147         i->locktype = LK_EXCLUSIVE;
  148 
  149         /*
  150          * Extract the procnum and convert to v3 form if necessary,
  151          * taking care to deal with out-of-range procnums.  Caller will
  152          * ensure that rq_vers is either 2 or 3.
  153          */
  154         procnum = req->rq_proc;
  155         if (!v3) {
  156                 rpcproc_t tmp_procnum;
  157 
  158                 tmp_procnum = cb->get_procnum(procnum);
  159                 if (tmp_procnum == -1)
  160                         goto out;
  161                 procnum = tmp_procnum;
  162         }
  163 
  164         /*
  165          * We do affinity for most.  However, we divide a realm of affinity
  166          * by file offset so as to allow for concurrent random access.  We
  167          * only do this for reads today, but this may change when IFS supports
  168          * efficient concurrent writes.
  169          */
  170         if (cb->no_offset(procnum))
  171                 goto out;
  172 
  173         error = cb->realign(&req->rq_args, M_NOWAIT);
  174         if (error)
  175                 goto out;
  176         md = req->rq_args;
  177         dpos = mtod(md, caddr_t);
  178 
  179         /* Grab the filehandle. */
  180         error = cb->get_fh(&fh, v3, &md, &dpos);
  181         if (error)
  182                 goto out;
  183 
  184         bcopy(fh.fh_fid.fid_data, &i->fh, sizeof(i->fh));
  185 
  186         /* Content ourselves with zero offset for all but reads. */
  187         if (cb->is_read(procnum) || cb->is_write(procnum))
  188                 cb->get_offset(&md, &dpos, v3, i);
  189 
  190 out:
  191         cb->set_locktype(procnum, i);
  192 }
  193 
  194 static struct fha_hash_entry *
  195 fha_hash_entry_new(u_int64_t fh)
  196 {
  197         struct fha_hash_entry *e;
  198 
  199         e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
  200         e->fh = fh;
  201         e->num_rw = 0;
  202         e->num_exclusive = 0;
  203         e->num_threads = 0;
  204         LIST_INIT(&e->threads);
  205 
  206         return (e);
  207 }
  208 
  209 static void
  210 fha_hash_entry_destroy(struct fha_hash_entry *e)
  211 {
  212 
  213         if (e->num_rw + e->num_exclusive)
  214                 panic("nonempty fhe");
  215         free(e, M_NFS_FHA);
  216 }
  217 
  218 static void
  219 fha_hash_entry_remove(struct fha_hash_entry *e)
  220 {
  221 
  222         LIST_REMOVE(e, link);
  223         fha_hash_entry_destroy(e);
  224 }
  225 
  226 static struct fha_hash_entry *
  227 fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
  228 {
  229         SVCPOOL *pool;
  230 
  231         pool = *softc->pool;
  232 
  233         struct fha_hash_entry *fhe, *new_fhe;
  234 
  235         LIST_FOREACH(fhe, &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
  236             link)
  237                 if (fhe->fh == fh)
  238                         break;
  239 
  240         if (!fhe) {
  241                 /* Allocate a new entry. */
  242                 mtx_unlock(&pool->sp_lock);
  243                 new_fhe = fha_hash_entry_new(fh);
  244                 mtx_lock(&pool->sp_lock);
  245 
  246                 /* Double-check to make sure we still need the new entry. */
  247                 LIST_FOREACH(fhe,
  248                     &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], link)
  249                         if (fhe->fh == fh)
  250                                 break;
  251                 if (!fhe) {
  252                         fhe = new_fhe;
  253                         LIST_INSERT_HEAD(
  254                             &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
  255                             fhe, link);
  256                 } else
  257                         fha_hash_entry_destroy(new_fhe);
  258         }
  259 
  260         return (fhe);
  261 }
  262 
  263 static void
  264 fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  265 {
  266 
  267         LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
  268         fhe->num_threads++;
  269 }
  270 
  271 static void
  272 fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
  273 {
  274 
  275         LIST_REMOVE(thread, st_alink);
  276         fhe->num_threads--;
  277 }
  278 
  279 /*
  280  * Account for an ongoing operation associated with this file.
  281  */
  282 static void
  283 fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
  284 {
  285 
  286         if (LK_EXCLUSIVE == locktype)
  287                 fhe->num_exclusive += count;
  288         else
  289                 fhe->num_rw += count;
  290 }
  291 
  292 static SVCTHREAD *
  293 get_idle_thread(SVCPOOL *pool)
  294 {
  295         SVCTHREAD *st;
  296 
  297         LIST_FOREACH(st, &pool->sp_idlethreads, st_ilink) {
  298                 if (st->st_xprt == NULL && STAILQ_EMPTY(&st->st_reqs))
  299                         return (st);
  300         }
  301         return (NULL);
  302 }
  303 
  304 
  305 /*
  306  * Get the service thread currently associated with the fhe that is
  307  * appropriate to handle this operation.
  308  */
  309 SVCTHREAD *
  310 fha_hash_entry_choose_thread(struct fha_params *softc,
  311     struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread);
  312 
  313 SVCTHREAD *
  314 fha_hash_entry_choose_thread(struct fha_params *softc,
  315     struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
  316 {
  317         SVCTHREAD *thread, *min_thread = NULL;
  318         SVCPOOL *pool;
  319         int req_count, min_count = 0;
  320         off_t offset1, offset2;
  321 
  322         pool = *softc->pool;
  323 
  324         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  325                 req_count = thread->st_reqcount;
  326 
  327                 /* If there are any writes in progress, use the first thread. */
  328                 if (fhe->num_exclusive) {
  329 #if 0
  330                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  331                             "fha: %p(%d)w", thread, req_count);
  332 #endif
  333                         return (thread);
  334                 }
  335 
  336                 /*
  337                  * Check for read locality, making sure that we won't
  338                  * exceed our per-thread load limit in the process.
  339                  */
  340                 offset1 = i->offset;
  341                 offset2 = STAILQ_FIRST(&thread->st_reqs)->rq_p3;
  342 
  343                 if (((offset1 >= offset2)
  344                   && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
  345                  || ((offset2 > offset1)
  346                   && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
  347                         if ((softc->ctls.max_reqs_per_nfsd == 0) ||
  348                             (req_count < softc->ctls.max_reqs_per_nfsd)) {
  349 #if 0
  350                                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  351                                     "fha: %p(%d)r", thread, req_count);
  352 #endif
  353                                 return (thread);
  354                         }
  355                 }
  356 
  357                 /*
  358                  * We don't have a locality match, so skip this thread,
  359                  * but keep track of the most attractive thread in case
  360                  * we need to come back to it later.
  361                  */
  362 #if 0
  363                 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  364                     "fha: %p(%d)s off1 %llu off2 %llu", thread,
  365                     req_count, offset1, offset2);
  366 #endif
  367                 if ((min_thread == NULL) || (req_count < min_count)) {
  368                         min_count = req_count;
  369                         min_thread = thread;
  370                 }
  371         }
  372 
  373         /*
  374          * We didn't find a good match yet.  See if we can add
  375          * a new thread to this file handle entry's thread list.
  376          */
  377         if ((softc->ctls.max_nfsds_per_fh == 0) ||
  378             (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
  379                 /*
  380                  * We can add a new thread, so try for an idle thread
  381                  * first, and fall back to this_thread if none are idle.
  382                  */
  383                 if (STAILQ_EMPTY(&this_thread->st_reqs)) {
  384                         thread = this_thread;
  385 #if 0
  386                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  387                             "fha: %p(%d)t", thread, thread->st_reqcount);
  388 #endif
  389                 } else if ((thread = get_idle_thread(pool))) {
  390 #if 0
  391                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  392                             "fha: %p(%d)i", thread, thread->st_reqcount);
  393 #endif
  394                 } else {
  395                         thread = this_thread;
  396 #if 0
  397                         ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
  398                             "fha: %p(%d)b", thread, thread->st_reqcount);
  399 #endif
  400                 }
  401                 fha_hash_entry_add_thread(fhe, thread);
  402         } else {
  403                 /*
  404                  * We don't want to use any more threads for this file, so
  405                  * go back to the most attractive nfsd we're already using.
  406                  */
  407                 thread = min_thread;
  408         }
  409 
  410         return (thread);
  411 }
  412 
  413 /*
  414  * After getting a request, try to assign it to some thread.  Usually we
  415  * handle it ourselves.
  416  */
  417 SVCTHREAD *
  418 fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
  419     struct fha_params *softc)
  420 {
  421         SVCPOOL *pool;
  422         SVCTHREAD *thread;
  423         struct fha_info i;
  424         struct fha_hash_entry *fhe;
  425         struct fha_callbacks *cb;
  426 
  427         cb = &softc->callbacks;
  428 
  429         /* Check to see whether we're enabled. */
  430         if (softc->ctls.enable == 0)
  431                 return (this_thread);
  432 
  433         /*
  434          * Only do placement if this is an NFS request.
  435          */
  436         if (req->rq_prog != NFS_PROG)
  437                 return (this_thread);
  438 
  439         if (req->rq_vers != 2 && req->rq_vers != 3)
  440                 return (this_thread);
  441 
  442         pool = req->rq_xprt->xp_pool;
  443         fha_extract_info(req, &i, cb);
  444 
  445         /*
  446          * We save the offset associated with this request for later
  447          * nfsd matching.
  448          */
  449         fhe = fha_hash_entry_lookup(softc, i.fh);
  450         req->rq_p1 = fhe;
  451         req->rq_p2 = i.locktype;
  452         req->rq_p3 = i.offset;
  453 
  454         /*
  455          * Choose a thread, taking into consideration locality, thread load,
  456          * and the number of threads already working on this file.
  457          */
  458         thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
  459         KASSERT(thread, ("fha_assign: NULL thread!"));
  460         fha_hash_entry_add_op(fhe, i.locktype, 1);
  461 
  462         return (thread);
  463 }
  464 
  465 /*
  466  * Called when we're done with an operation.  The request has already
  467  * been de-queued.
  468  */
  469 void
  470 fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
  471 {
  472         struct fha_hash_entry *fhe = req->rq_p1;
  473 
  474         /*
  475          * This may be called for reqs that didn't go through
  476          * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
  477          */
  478         if (!fhe)
  479                 return;
  480 
  481         fha_hash_entry_add_op(fhe, req->rq_p2, -1);
  482 
  483         if (thread->st_reqcount == 0) {
  484                 fha_hash_entry_remove_thread(fhe, thread);
  485                 if (0 == fhe->num_rw + fhe->num_exclusive)
  486                         fha_hash_entry_remove(fhe);
  487         }
  488 }
  489 
  490 int
  491 fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
  492 {
  493         int error, count, i;
  494         struct sbuf sb;
  495         struct fha_hash_entry *fhe;
  496         bool_t first = TRUE;
  497         SVCTHREAD *thread;
  498         SVCPOOL *pool;
  499 
  500         sbuf_new(&sb, NULL, 4096, SBUF_FIXEDLEN);
  501 
  502         pool = NULL;
  503 
  504         if (!*softc->pool) {
  505                 sbuf_printf(&sb, "NFSD not running\n");
  506                 goto out;
  507         }
  508         pool = *softc->pool;
  509 
  510         mtx_lock(&pool->sp_lock);
  511         count = 0;
  512         for (i = 0; i <= softc->g_fha.hashmask; i++)
  513                 if (!LIST_EMPTY(&softc->g_fha.hashtable[i]))
  514                         count++;
  515 
  516         if (count == 0) {
  517                 sbuf_printf(&sb, "No file handle entries.\n");
  518                 goto out;
  519         }
  520 
  521         for (i = 0; i <= softc->g_fha.hashmask; i++) {
  522                 LIST_FOREACH(fhe, &softc->g_fha.hashtable[i], link) {
  523                         sbuf_printf(&sb, "%sfhe %p: {\n", first ? "" : ", ", fhe);
  524 
  525                         sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
  526                         sbuf_printf(&sb, "    num_rw: %d\n", fhe->num_rw);
  527                         sbuf_printf(&sb, "    num_exclusive: %d\n", fhe->num_exclusive);
  528                         sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
  529 
  530                         LIST_FOREACH(thread, &fhe->threads, st_alink) {
  531                                 sbuf_printf(&sb, "    thread %p offset %ju "
  532                                     "(count %d)\n", thread,
  533                                     STAILQ_FIRST(&thread->st_reqs)->rq_p3,
  534                                     thread->st_reqcount);
  535                         }
  536 
  537                         sbuf_printf(&sb, "}");
  538                         first = FALSE;
  539 
  540                         /* Limit the output. */
  541                         if (++count > 128) {
  542                                 sbuf_printf(&sb, "...");
  543                                 break;
  544                         }
  545                 }
  546         }
  547 
  548  out:
  549         if (pool)
  550                 mtx_unlock(&pool->sp_lock);
  551         sbuf_trim(&sb);
  552         sbuf_finish(&sb);
  553         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
  554         sbuf_delete(&sb);
  555         return (error);
  556 }

Cache object: f1d5b248fe2b6c4112bc45fa7961fbea


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.