The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/raidframe/rf_dagutils.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: rf_dagutils.c,v 1.43.2.1 2004/04/11 11:19:16 tron Exp $        */
    2 /*
    3  * Copyright (c) 1995 Carnegie-Mellon University.
    4  * All rights reserved.
    5  *
    6  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
    7  *
    8  * Permission to use, copy, modify and distribute this software and
    9  * its documentation is hereby granted, provided that both the copyright
   10  * notice and this permission notice appear in all copies of the
   11  * software, derivative works or modified versions, and any portions
   12  * thereof, and that both notices appear in supporting documentation.
   13  *
   14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   17  *
   18  * Carnegie Mellon requests users of this software to return to
   19  *
   20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   21  *  School of Computer Science
   22  *  Carnegie Mellon University
   23  *  Pittsburgh PA 15213-3890
   24  *
   25  * any improvements or extensions that they make and grant Carnegie the
   26  * rights to redistribute these changes.
   27  */
   28 
   29 /******************************************************************************
   30  *
   31  * rf_dagutils.c -- utility routines for manipulating dags
   32  *
   33  *****************************************************************************/
   34 
   35 #include <sys/cdefs.h>
   36 __KERNEL_RCSID(0, "$NetBSD: rf_dagutils.c,v 1.43.2.1 2004/04/11 11:19:16 tron Exp $");
   37 
   38 #include <dev/raidframe/raidframevar.h>
   39 
   40 #include "rf_archs.h"
   41 #include "rf_threadstuff.h"
   42 #include "rf_raid.h"
   43 #include "rf_dag.h"
   44 #include "rf_dagutils.h"
   45 #include "rf_dagfuncs.h"
   46 #include "rf_general.h"
   47 #include "rf_map.h"
   48 #include "rf_shutdown.h"
   49 
   50 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
   51 
   52 const RF_RedFuncs_t rf_xorFuncs = {
   53         rf_RegularXorFunc, "Reg Xr",
   54         rf_SimpleXorFunc, "Simple Xr"};
   55 
   56 const RF_RedFuncs_t rf_xorRecoveryFuncs = {
   57         rf_RecoveryXorFunc, "Recovery Xr",
   58         rf_RecoveryXorFunc, "Recovery Xr"};
   59 
   60 #if RF_DEBUG_VALIDATE_DAG
   61 static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
   62 static void rf_PrintDAG(RF_DagHeader_t *);
   63 static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
   64                              RF_DagNode_t **, int);
   65 static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
   66 static void rf_ValidateVisitedBits(RF_DagHeader_t *);
   67 #endif /* RF_DEBUG_VALIDATE_DAG */
   68 
   69 /* The maximum number of nodes in a DAG is bounded by
   70 
   71 (2 * raidPtr->Layout->numDataCol) + (1 * layoutPtr->numParityCol) + 
   72         (1 * 2 * layoutPtr->numParityCol) + 3
   73 
   74 which is:  2*RF_MAXCOL+1*2+1*2*2+3
   75 
   76 For RF_MAXCOL of 40, this works out to 89.  We use this value to provide an estimate
   77 on the maximum size needed for RF_DAGPCACHE_SIZE.  For RF_MAXCOL of 40, this structure 
   78 would be 534 bytes.  Too much to have on-hand in a RF_DagNode_t, but should be ok to 
   79 have a few kicking around.
   80 */
   81 #define RF_DAGPCACHE_SIZE ((2*RF_MAXCOL+1*2+1*2*2+3) *(RF_MAX(sizeof(RF_DagParam_t), sizeof(RF_DagNode_t *))))
   82 
   83 
   84 /******************************************************************************
   85  *
   86  * InitNode - initialize a dag node
   87  *
   88  * the size of the propList array is always the same as that of the
   89  * successors array.
   90  *
   91  *****************************************************************************/
   92 void
   93 rf_InitNode(RF_DagNode_t *node, RF_NodeStatus_t initstatus, int commit,
   94     int (*doFunc) (RF_DagNode_t *node),
   95     int (*undoFunc) (RF_DagNode_t *node),
   96     int (*wakeFunc) (RF_DagNode_t *node, int status),
   97     int nSucc, int nAnte, int nParam, int nResult,
   98     RF_DagHeader_t *hdr, char *name, RF_AllocListElem_t *alist)
   99 {
  100         void  **ptrs;
  101         int     nptrs;
  102 
  103         if (nAnte > RF_MAX_ANTECEDENTS)
  104                 RF_PANIC();
  105         node->status = initstatus;
  106         node->commitNode = commit;
  107         node->doFunc = doFunc;
  108         node->undoFunc = undoFunc;
  109         node->wakeFunc = wakeFunc;
  110         node->numParams = nParam;
  111         node->numResults = nResult;
  112         node->numAntecedents = nAnte;
  113         node->numAntDone = 0;
  114         node->next = NULL;
  115         /* node->list_next = NULL */  /* Don't touch this here!  
  116                                          It may already be 
  117                                          in use by the caller! */
  118         node->numSuccedents = nSucc;
  119         node->name = name;
  120         node->dagHdr = hdr;
  121         node->big_dag_ptrs = NULL;
  122         node->big_dag_params = NULL;
  123         node->visited = 0;
  124 
  125         /* allocate all the pointers with one call to malloc */
  126         nptrs = nSucc + nAnte + nResult + nSucc;
  127 
  128         if (nptrs <= RF_DAG_PTRCACHESIZE) {
  129                 /*
  130                  * The dag_ptrs field of the node is basically some scribble
  131                  * space to be used here. We could get rid of it, and always
  132                  * allocate the range of pointers, but that's expensive. So,
  133                  * we pick a "common case" size for the pointer cache. Hopefully,
  134                  * we'll find that:
  135                  * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
  136                  *     only a little bit (least efficient case)
  137                  * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
  138                  *     (wasted memory)
  139                  */
  140                 ptrs = (void **) node->dag_ptrs;
  141         } else if (nptrs <= (RF_DAGPCACHE_SIZE / sizeof(RF_DagNode_t *))) {
  142                 node->big_dag_ptrs = rf_AllocDAGPCache();
  143                 ptrs = (void **) node->big_dag_ptrs;
  144         } else {
  145                 RF_MallocAndAdd(ptrs, nptrs * sizeof(void *), 
  146                                 (void **), alist);
  147         }
  148         node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
  149         node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
  150         node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
  151         node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs + nSucc + nAnte + nResult) : NULL;
  152 
  153         if (nParam) {
  154                 if (nParam <= RF_DAG_PARAMCACHESIZE) {
  155                         node->params = (RF_DagParam_t *) node->dag_params;
  156                 } else if (nParam <= (RF_DAGPCACHE_SIZE / sizeof(RF_DagParam_t))) {
  157                         node->big_dag_params = rf_AllocDAGPCache();
  158                         node->params = node->big_dag_params;
  159                 } else {
  160                         RF_MallocAndAdd(node->params, 
  161                                         nParam * sizeof(RF_DagParam_t), 
  162                                         (RF_DagParam_t *), alist);
  163                 }
  164         } else {
  165                 node->params = NULL;
  166         }
  167 }
  168 
  169 
  170 
  171 /******************************************************************************
  172  *
  173  * allocation and deallocation routines
  174  *
  175  *****************************************************************************/
  176 
  177 void 
  178 rf_FreeDAG(RF_DagHeader_t *dag_h)
  179 {
  180         RF_AccessStripeMapHeader_t *asmap, *t_asmap;
  181         RF_PhysDiskAddr_t *pda;
  182         RF_DagNode_t *tmpnode;
  183         RF_DagHeader_t *nextDag;
  184 
  185         while (dag_h) {
  186                 nextDag = dag_h->next;
  187                 rf_FreeAllocList(dag_h->allocList);
  188                 for (asmap = dag_h->asmList; asmap;) {
  189                         t_asmap = asmap;
  190                         asmap = asmap->next;
  191                         rf_FreeAccessStripeMap(t_asmap);
  192                 }
  193                 while (dag_h->pda_cleanup_list) {
  194                         pda = dag_h->pda_cleanup_list;
  195                         dag_h->pda_cleanup_list = dag_h->pda_cleanup_list->next;
  196                         rf_FreePhysDiskAddr(pda);
  197                 }
  198                 while (dag_h->nodes) {
  199                         tmpnode = dag_h->nodes;
  200                         dag_h->nodes = dag_h->nodes->list_next;
  201                         rf_FreeDAGNode(tmpnode);
  202                 }
  203                 rf_FreeDAGHeader(dag_h);
  204                 dag_h = nextDag;
  205         }
  206 }
  207 
  208 #define RF_MAX_FREE_DAGH 128
  209 #define RF_MIN_FREE_DAGH  32
  210 
  211 #define RF_MAX_FREE_DAGNODE 512 /* XXX Tune this... */
  212 #define RF_MIN_FREE_DAGNODE 128 /* XXX Tune this... */
  213 
  214 #define RF_MAX_FREE_DAGLIST 128
  215 #define RF_MIN_FREE_DAGLIST  32
  216 
  217 #define RF_MAX_FREE_DAGPCACHE 128
  218 #define RF_MIN_FREE_DAGPCACHE   8
  219 
  220 #define RF_MAX_FREE_FUNCLIST 128
  221 #define RF_MIN_FREE_FUNCLIST  32
  222 
  223 #define RF_MAX_FREE_BUFFERS 128
  224 #define RF_MIN_FREE_BUFFERS  32
  225 
  226 static void rf_ShutdownDAGs(void *);
  227 static void 
  228 rf_ShutdownDAGs(void *ignored)
  229 {
  230         pool_destroy(&rf_pools.dagh);
  231         pool_destroy(&rf_pools.dagnode);
  232         pool_destroy(&rf_pools.daglist);
  233         pool_destroy(&rf_pools.dagpcache);
  234         pool_destroy(&rf_pools.funclist);
  235 }
  236 
  237 int 
  238 rf_ConfigureDAGs(RF_ShutdownList_t **listp)
  239 {
  240 
  241         rf_pool_init(&rf_pools.dagnode, sizeof(RF_DagNode_t),
  242                      "rf_dagnode_pl", RF_MIN_FREE_DAGNODE, RF_MAX_FREE_DAGNODE);
  243         rf_pool_init(&rf_pools.dagh, sizeof(RF_DagHeader_t),
  244                      "rf_dagh_pl", RF_MIN_FREE_DAGH, RF_MAX_FREE_DAGH);
  245         rf_pool_init(&rf_pools.daglist, sizeof(RF_DagList_t),
  246                      "rf_daglist_pl", RF_MIN_FREE_DAGLIST, RF_MAX_FREE_DAGLIST);
  247         rf_pool_init(&rf_pools.dagpcache, RF_DAGPCACHE_SIZE,
  248                      "rf_dagpcache_pl", RF_MIN_FREE_DAGPCACHE, RF_MAX_FREE_DAGPCACHE);
  249         rf_pool_init(&rf_pools.funclist, sizeof(RF_FuncList_t),
  250                      "rf_funclist_pl", RF_MIN_FREE_FUNCLIST, RF_MAX_FREE_FUNCLIST);
  251         rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
  252 
  253         return (0);
  254 }
  255 
  256 RF_DagHeader_t *
  257 rf_AllocDAGHeader()
  258 {
  259         RF_DagHeader_t *dh;
  260 
  261         dh = pool_get(&rf_pools.dagh, PR_WAITOK);
  262         memset((char *) dh, 0, sizeof(RF_DagHeader_t));
  263         return (dh);
  264 }
  265 
  266 void 
  267 rf_FreeDAGHeader(RF_DagHeader_t * dh)
  268 {
  269         pool_put(&rf_pools.dagh, dh);
  270 }
  271 
  272 RF_DagNode_t *
  273 rf_AllocDAGNode()
  274 {
  275         RF_DagNode_t *node;
  276 
  277         node = pool_get(&rf_pools.dagnode, PR_WAITOK);
  278         memset(node, 0, sizeof(RF_DagNode_t));
  279         return (node);
  280 }
  281 
  282 void
  283 rf_FreeDAGNode(RF_DagNode_t *node)
  284 {
  285         if (node->big_dag_ptrs) {
  286                 rf_FreeDAGPCache(node->big_dag_ptrs);
  287         }
  288         if (node->big_dag_params) {
  289                 rf_FreeDAGPCache(node->big_dag_params);
  290         }
  291         pool_put(&rf_pools.dagnode, node);
  292 }
  293 
  294 RF_DagList_t *
  295 rf_AllocDAGList()
  296 {
  297         RF_DagList_t *dagList;
  298 
  299         dagList = pool_get(&rf_pools.daglist, PR_WAITOK);
  300         memset(dagList, 0, sizeof(RF_DagList_t));
  301 
  302         return (dagList);
  303 }
  304 
  305 void
  306 rf_FreeDAGList(RF_DagList_t *dagList)
  307 {
  308         pool_put(&rf_pools.daglist, dagList);
  309 }
  310 
  311 void *
  312 rf_AllocDAGPCache()
  313 {
  314         void *p;
  315         p = pool_get(&rf_pools.dagpcache, PR_WAITOK);
  316         memset(p, 0, RF_DAGPCACHE_SIZE);
  317         
  318         return (p);
  319 }
  320 
  321 void
  322 rf_FreeDAGPCache(void *p)
  323 {
  324         pool_put(&rf_pools.dagpcache, p);
  325 }
  326 
  327 RF_FuncList_t *
  328 rf_AllocFuncList()
  329 {
  330         RF_FuncList_t *funcList;
  331 
  332         funcList = pool_get(&rf_pools.funclist, PR_WAITOK);
  333         memset(funcList, 0, sizeof(RF_FuncList_t));
  334         
  335         return (funcList);
  336 }
  337 
  338 void
  339 rf_FreeFuncList(RF_FuncList_t *funcList)
  340 {
  341         pool_put(&rf_pools.funclist, funcList);
  342 }
  343 
  344 /* allocates a stripe buffer -- a buffer large enough to hold all the data
  345    in an entire stripe.  
  346 */
  347 
  348 void *
  349 rf_AllocStripeBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h, int size)
  350 {
  351         RF_VoidPointerListElem_t *vple;
  352         void *p;
  353 
  354         RF_ASSERT((size <= (raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << 
  355                                                raidPtr->logBytesPerSector))));
  356 
  357         p =  malloc( raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << 
  358                                         raidPtr->logBytesPerSector), 
  359                      M_RAIDFRAME, M_NOWAIT);
  360         if (!p) {
  361                 RF_LOCK_MUTEX(raidPtr->mutex);
  362                 if (raidPtr->stripebuf_count > 0) {
  363                         vple = raidPtr->stripebuf;
  364                         raidPtr->stripebuf = vple->next;
  365                         p = vple->p;
  366                         rf_FreeVPListElem(vple);
  367                         raidPtr->stripebuf_count--;
  368                 } else {
  369 #ifdef DIAGNOSTIC
  370                         printf("raid%d: Help!  Out of emergency full-stripe buffers!\n", raidPtr->raidid);
  371 #endif
  372                 }
  373                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  374                 if (!p) {
  375                         /* We didn't get a buffer... not much we can do other than wait, 
  376                            and hope that someone frees up memory for us.. */
  377                         p = malloc( raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << 
  378                                                        raidPtr->logBytesPerSector), M_RAIDFRAME, M_WAITOK);
  379                 }
  380         }
  381         memset(p, 0, raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << raidPtr->logBytesPerSector));
  382 
  383         vple = rf_AllocVPListElem();
  384         vple->p = p;
  385         vple->next = dag_h->desc->stripebufs;
  386         dag_h->desc->stripebufs = vple;
  387 
  388         return (p);
  389 }
  390 
  391 
  392 void
  393 rf_FreeStripeBuffer(RF_Raid_t *raidPtr, RF_VoidPointerListElem_t *vple)
  394 {
  395         RF_LOCK_MUTEX(raidPtr->mutex);
  396         if (raidPtr->stripebuf_count < raidPtr->numEmergencyStripeBuffers) {
  397                 /* just tack it in */
  398                 vple->next = raidPtr->stripebuf;
  399                 raidPtr->stripebuf = vple;
  400                 raidPtr->stripebuf_count++;
  401         } else {
  402                 free(vple->p, M_RAIDFRAME);
  403                 rf_FreeVPListElem(vple);
  404         }
  405         RF_UNLOCK_MUTEX(raidPtr->mutex);
  406 }
  407 
  408 /* allocates a buffer big enough to hold the data described by the
  409 caller (ie. the data of the associated PDA).  Glue this buffer
  410 into our dag_h cleanup structure. */
  411 
  412 void *
  413 rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h, int size)
  414 {
  415         RF_VoidPointerListElem_t *vple;
  416         void *p;
  417 
  418         p = rf_AllocIOBuffer(raidPtr, size);
  419         vple = rf_AllocVPListElem();
  420         vple->p = p;
  421         vple->next = dag_h->desc->iobufs;
  422         dag_h->desc->iobufs = vple;
  423 
  424         return (p);
  425 }
  426 
  427 void *
  428 rf_AllocIOBuffer(RF_Raid_t *raidPtr, int size)
  429 {
  430         RF_VoidPointerListElem_t *vple;
  431         void *p;
  432 
  433         RF_ASSERT((size <= (raidPtr->Layout.sectorsPerStripeUnit << 
  434                            raidPtr->logBytesPerSector)));
  435 
  436         p =  malloc( raidPtr->Layout.sectorsPerStripeUnit << 
  437                                  raidPtr->logBytesPerSector, 
  438                                  M_RAIDFRAME, M_NOWAIT);
  439         if (!p) {
  440                 RF_LOCK_MUTEX(raidPtr->mutex);
  441                 if (raidPtr->iobuf_count > 0) {
  442                         vple = raidPtr->iobuf;
  443                         raidPtr->iobuf = vple->next;
  444                         p = vple->p;
  445                         rf_FreeVPListElem(vple);
  446                         raidPtr->iobuf_count--;
  447                 } else {
  448 #ifdef DIAGNOSTIC
  449                         printf("raid%d: Help!  Out of emergency buffers!\n", raidPtr->raidid);
  450 #endif
  451                 }
  452                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  453                 if (!p) {
  454                         /* We didn't get a buffer... not much we can do other than wait, 
  455                            and hope that someone frees up memory for us.. */
  456                         p = malloc( raidPtr->Layout.sectorsPerStripeUnit << 
  457                                     raidPtr->logBytesPerSector, 
  458                                     M_RAIDFRAME, M_WAITOK);
  459                 }
  460         }
  461         memset(p, 0, raidPtr->Layout.sectorsPerStripeUnit << raidPtr->logBytesPerSector);
  462         return (p);
  463 }
  464 
  465 void
  466 rf_FreeIOBuffer(RF_Raid_t *raidPtr, RF_VoidPointerListElem_t *vple)
  467 {
  468         RF_LOCK_MUTEX(raidPtr->mutex);
  469         if (raidPtr->iobuf_count < raidPtr->numEmergencyBuffers) {
  470                 /* just tack it in */
  471                 vple->next = raidPtr->iobuf;
  472                 raidPtr->iobuf = vple;
  473                 raidPtr->iobuf_count++;
  474         } else {
  475                 free(vple->p, M_RAIDFRAME);
  476                 rf_FreeVPListElem(vple);
  477         }
  478         RF_UNLOCK_MUTEX(raidPtr->mutex);
  479 }
  480 
  481 
  482 
  483 #if RF_DEBUG_VALIDATE_DAG
  484 /******************************************************************************
  485  *
  486  * debug routines
  487  *
  488  *****************************************************************************/
  489 
  490 char   *
  491 rf_NodeStatusString(RF_DagNode_t *node)
  492 {
  493         switch (node->status) {
  494         case rf_wait:
  495                 return ("wait");
  496         case rf_fired:
  497                 return ("fired");
  498         case rf_good:
  499                 return ("good");
  500         case rf_bad:
  501                 return ("bad");
  502         default:
  503                 return ("?");
  504         }
  505 }
  506 
  507 void 
  508 rf_PrintNodeInfoString(RF_DagNode_t *node)
  509 {
  510         RF_PhysDiskAddr_t *pda;
  511         int     (*df) (RF_DagNode_t *) = node->doFunc;
  512         int     i, lk, unlk;
  513         void   *bufPtr;
  514 
  515         if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc)
  516             || (df == rf_DiskReadMirrorIdleFunc)
  517             || (df == rf_DiskReadMirrorPartitionFunc)) {
  518                 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  519                 bufPtr = (void *) node->params[1].p;
  520                 lk = 0;
  521                 unlk = 0;
  522                 RF_ASSERT(!(lk && unlk));
  523                 printf("c %d offs %ld nsect %d buf 0x%lx %s\n", pda->col,
  524                     (long) pda->startSector, (int) pda->numSector, (long) bufPtr,
  525                     (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
  526                 return;
  527         }
  528         if (df == rf_DiskUnlockFunc) {
  529                 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  530                 lk = 0;
  531                 unlk = 0;
  532                 RF_ASSERT(!(lk && unlk));
  533                 printf("c %d %s\n", pda->col,
  534                     (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
  535                 return;
  536         }
  537         if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
  538             || (df == rf_RecoveryXorFunc)) {
  539                 printf("result buf 0x%lx\n", (long) node->results[0]);
  540                 for (i = 0; i < node->numParams - 1; i += 2) {
  541                         pda = (RF_PhysDiskAddr_t *) node->params[i].p;
  542                         bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
  543                         printf("    buf 0x%lx c%d offs %ld nsect %d\n",
  544                             (long) bufPtr, pda->col,
  545                             (long) pda->startSector, (int) pda->numSector);
  546                 }
  547                 return;
  548         }
  549 #if RF_INCLUDE_PARITYLOGGING > 0
  550         if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
  551                 for (i = 0; i < node->numParams - 1; i += 2) {
  552                         pda = (RF_PhysDiskAddr_t *) node->params[i].p;
  553                         bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
  554                         printf(" c%d offs %ld nsect %d buf 0x%lx\n",
  555                             pda->col, (long) pda->startSector,
  556                             (int) pda->numSector, (long) bufPtr);
  557                 }
  558                 return;
  559         }
  560 #endif                          /* RF_INCLUDE_PARITYLOGGING > 0 */
  561 
  562         if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
  563                 printf("\n");
  564                 return;
  565         }
  566         printf("?\n");
  567 }
  568 #ifdef DEBUG
  569 static void 
  570 rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited)
  571 {
  572         char   *anttype;
  573         int     i;
  574 
  575         node->visited = (unvisited) ? 0 : 1;
  576         printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
  577             node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
  578             node->numSuccedents, node->numSuccFired, node->numSuccDone,
  579             node->numAntecedents, node->numAntDone, node->numParams, node->numResults);
  580         for (i = 0; i < node->numSuccedents; i++) {
  581                 printf("%d%s", node->succedents[i]->nodeNum,
  582                     ((i == node->numSuccedents - 1) ? "\0" : " "));
  583         }
  584         printf("} A{");
  585         for (i = 0; i < node->numAntecedents; i++) {
  586                 switch (node->antType[i]) {
  587                 case rf_trueData:
  588                         anttype = "T";
  589                         break;
  590                 case rf_antiData:
  591                         anttype = "A";
  592                         break;
  593                 case rf_outputData:
  594                         anttype = "O";
  595                         break;
  596                 case rf_control:
  597                         anttype = "C";
  598                         break;
  599                 default:
  600                         anttype = "?";
  601                         break;
  602                 }
  603                 printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i == node->numAntecedents - 1) ? "\0" : " ");
  604         }
  605         printf("}; ");
  606         rf_PrintNodeInfoString(node);
  607         for (i = 0; i < node->numSuccedents; i++) {
  608                 if (node->succedents[i]->visited == unvisited)
  609                         rf_RecurPrintDAG(node->succedents[i], depth + 1, unvisited);
  610         }
  611 }
  612 
  613 static void 
  614 rf_PrintDAG(RF_DagHeader_t *dag_h)
  615 {
  616         int     unvisited, i;
  617         char   *status;
  618 
  619         /* set dag status */
  620         switch (dag_h->status) {
  621         case rf_enable:
  622                 status = "enable";
  623                 break;
  624         case rf_rollForward:
  625                 status = "rollForward";
  626                 break;
  627         case rf_rollBackward:
  628                 status = "rollBackward";
  629                 break;
  630         default:
  631                 status = "illegal!";
  632                 break;
  633         }
  634         /* find out if visited bits are currently set or clear */
  635         unvisited = dag_h->succedents[0]->visited;
  636 
  637         printf("DAG type:  %s\n", dag_h->creator);
  638         printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
  639         printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
  640             status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
  641         for (i = 0; i < dag_h->numSuccedents; i++) {
  642                 printf("%d%s", dag_h->succedents[i]->nodeNum,
  643                     ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
  644         }
  645         printf("};\n");
  646         for (i = 0; i < dag_h->numSuccedents; i++) {
  647                 if (dag_h->succedents[i]->visited == unvisited)
  648                         rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
  649         }
  650 }
  651 #endif
  652 /* assigns node numbers */
  653 int 
  654 rf_AssignNodeNums(RF_DagHeader_t * dag_h)
  655 {
  656         int     unvisited, i, nnum;
  657         RF_DagNode_t *node;
  658 
  659         nnum = 0;
  660         unvisited = dag_h->succedents[0]->visited;
  661 
  662         dag_h->nodeNum = nnum++;
  663         for (i = 0; i < dag_h->numSuccedents; i++) {
  664                 node = dag_h->succedents[i];
  665                 if (node->visited == unvisited) {
  666                         nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
  667                 }
  668         }
  669         return (nnum);
  670 }
  671 
  672 int 
  673 rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited)
  674 {
  675         int     i;
  676 
  677         node->visited = (unvisited) ? 0 : 1;
  678 
  679         node->nodeNum = num++;
  680         for (i = 0; i < node->numSuccedents; i++) {
  681                 if (node->succedents[i]->visited == unvisited) {
  682                         num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
  683                 }
  684         }
  685         return (num);
  686 }
  687 /* set the header pointers in each node to "newptr" */
  688 void 
  689 rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr)
  690 {
  691         int     i;
  692         for (i = 0; i < dag_h->numSuccedents; i++)
  693                 if (dag_h->succedents[i]->dagHdr != newptr)
  694                         rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
  695 }
  696 
  697 void 
  698 rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr)
  699 {
  700         int     i;
  701         node->dagHdr = newptr;
  702         for (i = 0; i < node->numSuccedents; i++)
  703                 if (node->succedents[i]->dagHdr != newptr)
  704                         rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
  705 }
  706 
  707 
  708 void 
  709 rf_PrintDAGList(RF_DagHeader_t * dag_h)
  710 {
  711         int     i = 0;
  712 
  713         for (; dag_h; dag_h = dag_h->next) {
  714                 rf_AssignNodeNums(dag_h);
  715                 printf("\n\nDAG %d IN LIST:\n", i++);
  716                 rf_PrintDAG(dag_h);
  717         }
  718 }
  719 
  720 static int 
  721 rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount,
  722                   RF_DagNode_t **nodes, int unvisited)
  723 {
  724         int     i, retcode = 0;
  725 
  726         /* construct an array of node pointers indexed by node num */
  727         node->visited = (unvisited) ? 0 : 1;
  728         nodes[node->nodeNum] = node;
  729 
  730         if (node->next != NULL) {
  731                 printf("INVALID DAG: next pointer in node is not NULL\n");
  732                 retcode = 1;
  733         }
  734         if (node->status != rf_wait) {
  735                 printf("INVALID DAG: Node status is not wait\n");
  736                 retcode = 1;
  737         }
  738         if (node->numAntDone != 0) {
  739                 printf("INVALID DAG: numAntDone is not zero\n");
  740                 retcode = 1;
  741         }
  742         if (node->doFunc == rf_TerminateFunc) {
  743                 if (node->numSuccedents != 0) {
  744                         printf("INVALID DAG: Terminator node has succedents\n");
  745                         retcode = 1;
  746                 }
  747         } else {
  748                 if (node->numSuccedents == 0) {
  749                         printf("INVALID DAG: Non-terminator node has no succedents\n");
  750                         retcode = 1;
  751                 }
  752         }
  753         for (i = 0; i < node->numSuccedents; i++) {
  754                 if (!node->succedents[i]) {
  755                         printf("INVALID DAG: succedent %d of node %s is NULL\n", i, node->name);
  756                         retcode = 1;
  757                 }
  758                 scount[node->succedents[i]->nodeNum]++;
  759         }
  760         for (i = 0; i < node->numAntecedents; i++) {
  761                 if (!node->antecedents[i]) {
  762                         printf("INVALID DAG: antecedent %d of node %s is NULL\n", i, node->name);
  763                         retcode = 1;
  764                 }
  765                 acount[node->antecedents[i]->nodeNum]++;
  766         }
  767         for (i = 0; i < node->numSuccedents; i++) {
  768                 if (node->succedents[i]->visited == unvisited) {
  769                         if (rf_ValidateBranch(node->succedents[i], scount,
  770                                 acount, nodes, unvisited)) {
  771                                 retcode = 1;
  772                         }
  773                 }
  774         }
  775         return (retcode);
  776 }
  777 
  778 static void 
  779 rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl)
  780 {
  781         int     i;
  782 
  783         RF_ASSERT(node->visited == unvisited);
  784         for (i = 0; i < node->numSuccedents; i++) {
  785                 if (node->succedents[i] == NULL) {
  786                         printf("node=%lx node->succedents[%d] is NULL\n", (long) node, i);
  787                         RF_ASSERT(0);
  788                 }
  789                 rf_ValidateBranchVisitedBits(node->succedents[i], unvisited, rl + 1);
  790         }
  791 }
  792 /* NOTE:  never call this on a big dag, because it is exponential
  793  * in execution time
  794  */
  795 static void 
  796 rf_ValidateVisitedBits(RF_DagHeader_t *dag)
  797 {
  798         int     i, unvisited;
  799 
  800         unvisited = dag->succedents[0]->visited;
  801 
  802         for (i = 0; i < dag->numSuccedents; i++) {
  803                 if (dag->succedents[i] == NULL) {
  804                         printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
  805                         RF_ASSERT(0);
  806                 }
  807                 rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
  808         }
  809 }
  810 /* validate a DAG.  _at entry_ verify that:
  811  *   -- numNodesCompleted is zero
  812  *   -- node queue is null
  813  *   -- dag status is rf_enable
  814  *   -- next pointer is null on every node
  815  *   -- all nodes have status wait
  816  *   -- numAntDone is zero in all nodes
  817  *   -- terminator node has zero successors
  818  *   -- no other node besides terminator has zero successors
  819  *   -- no successor or antecedent pointer in a node is NULL
  820  *   -- number of times that each node appears as a successor of another node
  821  *      is equal to the antecedent count on that node
  822  *   -- number of times that each node appears as an antecedent of another node
  823  *      is equal to the succedent count on that node
  824  *   -- what else?
  825  */
  826 int 
  827 rf_ValidateDAG(RF_DagHeader_t *dag_h)
  828 {
  829         int     i, nodecount;
  830         int    *scount, *acount;/* per-node successor and antecedent counts */
  831         RF_DagNode_t **nodes;   /* array of ptrs to nodes in dag */
  832         int     retcode = 0;
  833         int     unvisited;
  834         int     commitNodeCount = 0;
  835 
  836         if (rf_validateVisitedDebug)
  837                 rf_ValidateVisitedBits(dag_h);
  838 
  839         if (dag_h->numNodesCompleted != 0) {
  840                 printf("INVALID DAG: num nodes completed is %d, should be 0\n", dag_h->numNodesCompleted);
  841                 retcode = 1;
  842                 goto validate_dag_bad;
  843         }
  844         if (dag_h->status != rf_enable) {
  845                 printf("INVALID DAG: not enabled\n");
  846                 retcode = 1;
  847                 goto validate_dag_bad;
  848         }
  849         if (dag_h->numCommits != 0) {
  850                 printf("INVALID DAG: numCommits != 0 (%d)\n", dag_h->numCommits);
  851                 retcode = 1;
  852                 goto validate_dag_bad;
  853         }
  854         if (dag_h->numSuccedents != 1) {
  855                 /* currently, all dags must have only one succedent */
  856                 printf("INVALID DAG: numSuccedents !1 (%d)\n", dag_h->numSuccedents);
  857                 retcode = 1;
  858                 goto validate_dag_bad;
  859         }
  860         nodecount = rf_AssignNodeNums(dag_h);
  861 
  862         unvisited = dag_h->succedents[0]->visited;
  863 
  864         RF_Malloc(scount, nodecount * sizeof(int), (int *));
  865         RF_Malloc(acount, nodecount * sizeof(int), (int *));
  866         RF_Malloc(nodes, nodecount * sizeof(RF_DagNode_t *), 
  867                   (RF_DagNode_t **));
  868         for (i = 0; i < dag_h->numSuccedents; i++) {
  869                 if ((dag_h->succedents[i]->visited == unvisited)
  870                     && rf_ValidateBranch(dag_h->succedents[i], scount,
  871                         acount, nodes, unvisited)) {
  872                         retcode = 1;
  873                 }
  874         }
  875         /* start at 1 to skip the header node */
  876         for (i = 1; i < nodecount; i++) {
  877                 if (nodes[i]->commitNode)
  878                         commitNodeCount++;
  879                 if (nodes[i]->doFunc == NULL) {
  880                         printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
  881                         retcode = 1;
  882                         goto validate_dag_out;
  883                 }
  884                 if (nodes[i]->undoFunc == NULL) {
  885                         printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
  886                         retcode = 1;
  887                         goto validate_dag_out;
  888                 }
  889                 if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
  890                         printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
  891                             nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
  892                         retcode = 1;
  893                         goto validate_dag_out;
  894                 }
  895                 if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
  896                         printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
  897                             nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
  898                         retcode = 1;
  899                         goto validate_dag_out;
  900                 }
  901         }
  902 
  903         if (dag_h->numCommitNodes != commitNodeCount) {
  904                 printf("INVALID DAG: incorrect commit node count.  hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
  905                     dag_h->numCommitNodes, commitNodeCount);
  906                 retcode = 1;
  907                 goto validate_dag_out;
  908         }
  909 validate_dag_out:
  910         RF_Free(scount, nodecount * sizeof(int));
  911         RF_Free(acount, nodecount * sizeof(int));
  912         RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
  913         if (retcode)
  914                 rf_PrintDAGList(dag_h);
  915 
  916         if (rf_validateVisitedDebug)
  917                 rf_ValidateVisitedBits(dag_h);
  918 
  919         return (retcode);
  920 
  921 validate_dag_bad:
  922         rf_PrintDAGList(dag_h);
  923         return (retcode);
  924 }
  925 
  926 #endif /* RF_DEBUG_VALIDATE_DAG */
  927 
  928 /******************************************************************************
  929  *
  930  * misc construction routines
  931  *
  932  *****************************************************************************/
  933 
  934 void 
  935 rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
  936 {
  937         int     ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
  938         int     fcol = raidPtr->reconControl->fcol;
  939         int     scol = raidPtr->reconControl->spareCol;
  940         RF_PhysDiskAddr_t *pda;
  941 
  942         RF_ASSERT(raidPtr->status == rf_rs_reconstructing);
  943         for (pda = asmap->physInfo; pda; pda = pda->next) {
  944                 if (pda->col == fcol) {
  945 #if RF_DEBUG_DAG
  946                         if (rf_dagDebug) {
  947                                 if (!rf_CheckRUReconstructed(raidPtr->reconControl->reconMap,
  948                                         pda->startSector)) {
  949                                         RF_PANIC();
  950                                 }
  951                         }
  952 #endif
  953                         /* printf("Remapped data for large write\n"); */
  954                         if (ds) {
  955                                 raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
  956                                     &pda->col, &pda->startSector, RF_REMAP);
  957                         } else {
  958                                 pda->col = scol;
  959                         }
  960                 }
  961         }
  962         for (pda = asmap->parityInfo; pda; pda = pda->next) {
  963                 if (pda->col == fcol) {
  964 #if RF_DEBUG_DAG
  965                         if (rf_dagDebug) {
  966                                 if (!rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, pda->startSector)) {
  967                                         RF_PANIC();
  968                                 }
  969                         }
  970 #endif
  971                 }
  972                 if (ds) {
  973                         (raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->col, &pda->startSector, RF_REMAP);
  974                 } else {
  975                         pda->col = scol;
  976                 }
  977         }
  978 }
  979 
  980 
  981 /* this routine allocates read buffers and generates stripe maps for the
  982  * regions of the array from the start of the stripe to the start of the
  983  * access, and from the end of the access to the end of the stripe.  It also
  984  * computes and returns the number of DAG nodes needed to read all this data.
  985  * Note that this routine does the wrong thing if the access is fully
  986  * contained within one stripe unit, so we RF_ASSERT against this case at the
  987  * start.
  988  * 
  989  * layoutPtr - in: layout information
  990  * asmap     - in: access stripe map
  991  * dag_h     - in: header of the dag to create
  992  * new_asm_h - in: ptr to array of 2 headers.  to be filled in
  993  * nRodNodes - out: num nodes to be generated to read unaccessed data
  994  * sosBuffer, eosBuffer - out: pointers to newly allocated buffer
  995  */
  996 void 
  997 rf_MapUnaccessedPortionOfStripe(RF_Raid_t *raidPtr,
  998                                 RF_RaidLayout_t *layoutPtr,
  999                                 RF_AccessStripeMap_t *asmap,
 1000                                 RF_DagHeader_t *dag_h,
 1001                                 RF_AccessStripeMapHeader_t **new_asm_h,
 1002                                 int *nRodNodes, 
 1003                                 char **sosBuffer, char **eosBuffer,
 1004                                 RF_AllocListElem_t *allocList)
 1005 {
 1006         RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
 1007         RF_SectorNum_t sosNumSector, eosNumSector;
 1008 
 1009         RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
 1010         /* generate an access map for the region of the array from start of
 1011          * stripe to start of access */
 1012         new_asm_h[0] = new_asm_h[1] = NULL;
 1013         *nRodNodes = 0;
 1014         if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
 1015                 sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
 1016                 sosNumSector = asmap->raidAddress - sosRaidAddress;
 1017                 *sosBuffer = rf_AllocStripeBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, sosNumSector));
 1018                 new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
 1019                 new_asm_h[0]->next = dag_h->asmList;
 1020                 dag_h->asmList = new_asm_h[0];
 1021                 *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
 1022 
 1023                 RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
 1024                 /* we're totally within one stripe here */
 1025                 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
 1026                         rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
 1027         }
 1028         /* generate an access map for the region of the array from end of
 1029          * access to end of stripe */
 1030         if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
 1031                 eosRaidAddress = asmap->endRaidAddress;
 1032                 eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
 1033                 *eosBuffer = rf_AllocStripeBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, eosNumSector));
 1034                 new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
 1035                 new_asm_h[1]->next = dag_h->asmList;
 1036                 dag_h->asmList = new_asm_h[1];
 1037                 *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
 1038 
 1039                 RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
 1040                 /* we're totally within one stripe here */
 1041                 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
 1042                         rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
 1043         }
 1044 }
 1045 
 1046 
 1047 
 1048 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
 1049 int 
 1050 rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, 
 1051               RF_PhysDiskAddr_t *src, RF_PhysDiskAddr_t *dest)
 1052 {
 1053         RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
 1054         RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
 1055         /* use -1 to be sure we stay within SU */
 1056         RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1);
 1057         RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1);
 1058         return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0);
 1059 }
 1060 
 1061 
 1062 /* GenerateFailedAccessASMs
 1063  *
 1064  * this routine figures out what portion of the stripe needs to be read
 1065  * to effect the degraded read or write operation.  It's primary function
 1066  * is to identify everything required to recover the data, and then
 1067  * eliminate anything that is already being accessed by the user.
 1068  *
 1069  * The main result is two new ASMs, one for the region from the start of the
 1070  * stripe to the start of the access, and one for the region from the end of
 1071  * the access to the end of the stripe.  These ASMs describe everything that
 1072  * needs to be read to effect the degraded access.  Other results are:
 1073  *    nXorBufs -- the total number of buffers that need to be XORed together to
 1074  *                recover the lost data,
 1075  *    rpBufPtr -- ptr to a newly-allocated buffer to hold the parity.  If NULL
 1076  *                at entry, not allocated.
 1077  *    overlappingPDAs --
 1078  *                describes which of the non-failed PDAs in the user access
 1079  *                overlap data that needs to be read to effect recovery.
 1080  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
 1081  *                PDA, the ith pda in the input asm overlaps data that needs
 1082  *                to be read for recovery.
 1083  */
 1084  /* in: asm - ASM for the actual access, one stripe only */
 1085  /* in: failedPDA - which component of the access has failed */
 1086  /* in: dag_h - header of the DAG we're going to create */
 1087  /* out: new_asm_h - the two new ASMs */
 1088  /* out: nXorBufs - the total number of xor bufs required */
 1089  /* out: rpBufPtr - a buffer for the parity read */
 1090 void 
 1091 rf_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
 1092                             RF_PhysDiskAddr_t *failedPDA,
 1093                             RF_DagHeader_t *dag_h,
 1094                             RF_AccessStripeMapHeader_t **new_asm_h,
 1095                             int *nXorBufs, char **rpBufPtr,
 1096                             char *overlappingPDAs,
 1097                             RF_AllocListElem_t *allocList)
 1098 {
 1099         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
 1100 
 1101         /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
 1102         RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
 1103         RF_PhysDiskAddr_t *pda;
 1104         int     foundit, i;
 1105 
 1106         foundit = 0;
 1107         /* first compute the following raid addresses: start of stripe,
 1108          * (sosAddr) MIN(start of access, start of failed SU),   (sosEndAddr)
 1109          * MAX(end of access, end of failed SU),       (eosStartAddr) end of
 1110          * stripe (i.e. start of next stripe)   (eosAddr) */
 1111         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
 1112         sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
 1113         eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
 1114         eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
 1115 
 1116         /* now generate access stripe maps for each of the above regions of
 1117          * the stripe.  Use a dummy (NULL) buf ptr for now */
 1118 
 1119         new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL, RF_DONT_REMAP) : NULL;
 1120         new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
 1121 
 1122         /* walk through the PDAs and range-restrict each SU to the region of
 1123          * the SU touched on the failed PDA.  also compute total data buffer
 1124          * space requirements in this step.  Ignore the parity for now. */
 1125         /* Also count nodes to find out how many bufs need to be xored together */
 1126         (*nXorBufs) = 1;        /* in read case, 1 is for parity.  In write
 1127                                  * case, 1 is for failed data */
 1128 
 1129         if (new_asm_h[0]) {
 1130                 new_asm_h[0]->next = dag_h->asmList;
 1131                 dag_h->asmList = new_asm_h[0];
 1132                 for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
 1133                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0);
 1134                         pda->bufPtr = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);
 1135                 }
 1136                 (*nXorBufs) += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
 1137         }
 1138         if (new_asm_h[1]) {
 1139                 new_asm_h[1]->next = dag_h->asmList;
 1140                 dag_h->asmList = new_asm_h[1];
 1141                 for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
 1142                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0);
 1143                         pda->bufPtr = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);
 1144                 }
 1145                 (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
 1146         }
 1147         
 1148         /* allocate a buffer for parity */
 1149         if (rpBufPtr) 
 1150                 *rpBufPtr = rf_AllocBuffer(raidPtr, dag_h, failedPDA->numSector << raidPtr->logBytesPerSector);
 1151 
 1152         /* the last step is to figure out how many more distinct buffers need
 1153          * to get xor'd to produce the missing unit.  there's one for each
 1154          * user-data read node that overlaps the portion of the failed unit
 1155          * being accessed */
 1156 
 1157         for (foundit = i = 0, pda = asmap->physInfo; pda; i++, pda = pda->next) {
 1158                 if (pda == failedPDA) {
 1159                         i--;
 1160                         foundit = 1;
 1161                         continue;
 1162                 }
 1163                 if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
 1164                         overlappingPDAs[i] = 1;
 1165                         (*nXorBufs)++;
 1166                 }
 1167         }
 1168         if (!foundit) {
 1169                 RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n");
 1170                 RF_ASSERT(0);
 1171         }
 1172 #if RF_DEBUG_DAG
 1173         if (rf_degDagDebug) {
 1174                 if (new_asm_h[0]) {
 1175                         printf("First asm:\n");
 1176                         rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
 1177                 }
 1178                 if (new_asm_h[1]) {
 1179                         printf("Second asm:\n");
 1180                         rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
 1181                 }
 1182         }
 1183 #endif
 1184 }
 1185 
 1186 
 1187 /* adjusts the offset and number of sectors in the destination pda so that
 1188  * it covers at most the region of the SU covered by the source PDA.  This
 1189  * is exclusively a restriction:  the number of sectors indicated by the
 1190  * target PDA can only shrink.
 1191  *
 1192  * For example:  s = sectors within SU indicated by source PDA
 1193  *               d = sectors within SU indicated by dest PDA
 1194  *               r = results, stored in dest PDA
 1195  *
 1196  * |--------------- one stripe unit ---------------------|
 1197  * |           sssssssssssssssssssssssssssssssss         |
 1198  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
 1199  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
 1200  *
 1201  * Another example:
 1202  *
 1203  * |--------------- one stripe unit ---------------------|
 1204  * |           sssssssssssssssssssssssssssssssss         |
 1205  * |    ddddddddddddddddddddddd                          |
 1206  * |           rrrrrrrrrrrrrrrr                          |
 1207  *
 1208  */
 1209 void 
 1210 rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
 1211                     RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr)
 1212 {
 1213         RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
 1214         RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
 1215         RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
 1216         RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1);    /* use -1 to be sure we
 1217                                                                                                          * stay within SU */
 1218         RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1);
 1219         RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector);  /* stripe unit boundary */
 1220 
 1221         dest->startSector = subAddr + RF_MAX(soffs, doffs);
 1222         dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
 1223 
 1224         if (dobuffer)
 1225                 dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
 1226         if (doraidaddr) {
 1227                 dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
 1228                     rf_StripeUnitOffset(layoutPtr, dest->startSector);
 1229         }
 1230 }
 1231 
 1232 #if (RF_INCLUDE_CHAINDECLUSTER > 0)
 1233 
 1234 /*
 1235  * Want the highest of these primes to be the largest one
 1236  * less than the max expected number of columns (won't hurt
 1237  * to be too small or too large, but won't be optimal, either)
 1238  * --jimz
 1239  */
 1240 #define NLOWPRIMES 8
 1241 static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
 1242 /*****************************************************************************
 1243  * compute the workload shift factor.  (chained declustering)
 1244  *
 1245  * return nonzero if access should shift to secondary, otherwise,
 1246  * access is to primary
 1247  *****************************************************************************/
 1248 int 
 1249 rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda)
 1250 {
 1251         /*
 1252          * variables:
 1253          *  d   = column of disk containing primary
 1254          *  f   = column of failed disk
 1255          *  n   = number of disks in array
 1256          *  sd  = "shift distance" (number of columns that d is to the right of f)
 1257          *  v   = numerator of redirection ratio
 1258          *  k   = denominator of redirection ratio
 1259          */
 1260         RF_RowCol_t d, f, sd, n;
 1261         int     k, v, ret, i;
 1262 
 1263         n = raidPtr->numCol;
 1264 
 1265         /* assign column of primary copy to d */
 1266         d = pda->col;
 1267 
 1268         /* assign column of dead disk to f */
 1269         for (f = 0; ((!RF_DEAD_DISK(raidPtr->Disks[f].status)) && (f < n)); f++);
 1270 
 1271         RF_ASSERT(f < n);
 1272         RF_ASSERT(f != d);
 1273 
 1274         sd = (f > d) ? (n + d - f) : (d - f);
 1275         RF_ASSERT(sd < n);
 1276 
 1277         /*
 1278          * v of every k accesses should be redirected
 1279          *
 1280          * v/k := (n-1-sd)/(n-1)
 1281          */
 1282         v = (n - 1 - sd);
 1283         k = (n - 1);
 1284 
 1285 #if 1
 1286         /*
 1287          * XXX
 1288          * Is this worth it?
 1289          *
 1290          * Now reduce the fraction, by repeatedly factoring
 1291          * out primes (just like they teach in elementary school!)
 1292          */
 1293         for (i = 0; i < NLOWPRIMES; i++) {
 1294                 if (lowprimes[i] > v)
 1295                         break;
 1296                 while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
 1297                         v /= lowprimes[i];
 1298                         k /= lowprimes[i];
 1299                 }
 1300         }
 1301 #endif
 1302 
 1303         raidPtr->hist_diskreq[d]++;
 1304         if (raidPtr->hist_diskreq[d] > v) {
 1305                 ret = 0;        /* do not redirect */
 1306         } else {
 1307                 ret = 1;        /* redirect */
 1308         }
 1309 
 1310 #if 0
 1311         printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
 1312             raidPtr->hist_diskreq[d]);
 1313 #endif
 1314 
 1315         if (raidPtr->hist_diskreq[d] >= k) {
 1316                 /* reset counter */
 1317                 raidPtr->hist_diskreq[d] = 0;
 1318         }
 1319         return (ret);
 1320 }
 1321 #endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */
 1322 
 1323 /*
 1324  * Disk selection routines
 1325  */
 1326 
 1327 /*
 1328  * Selects the disk with the shortest queue from a mirror pair.
 1329  * Both the disk I/Os queued in RAIDframe as well as those at the physical
 1330  * disk are counted as members of the "queue"
 1331  */
 1332 void 
 1333 rf_SelectMirrorDiskIdle(RF_DagNode_t * node)
 1334 {
 1335         RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
 1336         RF_RowCol_t colData, colMirror;
 1337         int     dataQueueLength, mirrorQueueLength, usemirror;
 1338         RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
 1339         RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
 1340         RF_PhysDiskAddr_t *tmp_pda;
 1341         RF_RaidDisk_t *disks = raidPtr->Disks;
 1342         RF_DiskQueue_t *dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
 1343 
 1344         /* return the [row col] of the disk with the shortest queue */
 1345         colData = data_pda->col;
 1346         colMirror = mirror_pda->col;
 1347         dataQueue = &(dqs[colData]);
 1348         mirrorQueue = &(dqs[colMirror]);
 1349 
 1350 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
 1351         RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
 1352 #endif                          /* RF_LOCK_QUEUES_TO_READ_LEN */
 1353         dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
 1354 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
 1355         RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
 1356         RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
 1357 #endif                          /* RF_LOCK_QUEUES_TO_READ_LEN */
 1358         mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
 1359 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
 1360         RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
 1361 #endif                          /* RF_LOCK_QUEUES_TO_READ_LEN */
 1362 
 1363         usemirror = 0;
 1364         if (RF_DEAD_DISK(disks[colMirror].status)) {
 1365                 usemirror = 0;
 1366         } else
 1367                 if (RF_DEAD_DISK(disks[colData].status)) {
 1368                         usemirror = 1;
 1369                 } else
 1370                         if (raidPtr->parity_good == RF_RAID_DIRTY) {
 1371                                 /* Trust only the main disk */
 1372                                 usemirror = 0;
 1373                         } else
 1374                                 if (dataQueueLength < mirrorQueueLength) {
 1375                                         usemirror = 0;
 1376                                 } else
 1377                                         if (mirrorQueueLength < dataQueueLength) {
 1378                                                 usemirror = 1;
 1379                                         } else {
 1380                                                 /* queues are equal length. attempt
 1381                                                  * cleverness. */
 1382                                                 if (SNUM_DIFF(dataQueue->last_deq_sector, data_pda->startSector)
 1383                                                     <= SNUM_DIFF(mirrorQueue->last_deq_sector, mirror_pda->startSector)) {
 1384                                                         usemirror = 0;
 1385                                                 } else {
 1386                                                         usemirror = 1;
 1387                                                 }
 1388                                         }
 1389 
 1390         if (usemirror) {
 1391                 /* use mirror (parity) disk, swap params 0 & 4 */
 1392                 tmp_pda = data_pda;
 1393                 node->params[0].p = mirror_pda;
 1394                 node->params[4].p = tmp_pda;
 1395         } else {
 1396                 /* use data disk, leave param 0 unchanged */
 1397         }
 1398         /* printf("dataQueueLength %d, mirrorQueueLength
 1399          * %d\n",dataQueueLength, mirrorQueueLength); */
 1400 }
 1401 #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0)
 1402 /*
 1403  * Do simple partitioning. This assumes that
 1404  * the data and parity disks are laid out identically.
 1405  */
 1406 void 
 1407 rf_SelectMirrorDiskPartition(RF_DagNode_t * node)
 1408 {
 1409         RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
 1410         RF_RowCol_t colData, colMirror;
 1411         RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
 1412         RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
 1413         RF_PhysDiskAddr_t *tmp_pda;
 1414         RF_RaidDisk_t *disks = raidPtr->Disks;
 1415         int     usemirror;
 1416 
 1417         /* return the [row col] of the disk with the shortest queue */
 1418         colData = data_pda->col;
 1419         colMirror = mirror_pda->col;
 1420 
 1421         usemirror = 0;
 1422         if (RF_DEAD_DISK(disks[colMirror].status)) {
 1423                 usemirror = 0;
 1424         } else
 1425                 if (RF_DEAD_DISK(disks[colData].status)) {
 1426                         usemirror = 1;
 1427                 } else 
 1428                         if (raidPtr->parity_good == RF_RAID_DIRTY) {
 1429                                 /* Trust only the main disk */
 1430                                 usemirror = 0;
 1431                         } else
 1432                                 if (data_pda->startSector < 
 1433                                     (disks[colData].numBlocks / 2)) {
 1434                                         usemirror = 0;
 1435                                 } else {
 1436                                         usemirror = 1;
 1437                                 }
 1438 
 1439         if (usemirror) {
 1440                 /* use mirror (parity) disk, swap params 0 & 4 */
 1441                 tmp_pda = data_pda;
 1442                 node->params[0].p = mirror_pda;
 1443                 node->params[4].p = tmp_pda;
 1444         } else {
 1445                 /* use data disk, leave param 0 unchanged */
 1446         }
 1447 }
 1448 #endif

Cache object: 32edfc493e66311859d1a4c02815ae47


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.