The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/fs/jfs/jfs_dmap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  *   Copyright (c) International Business Machines Corp., 2000-2002
    3  *
    4  *   This program is free software;  you can redistribute it and/or modify
    5  *   it under the terms of the GNU General Public License as published by
    6  *   the Free Software Foundation; either version 2 of the License, or 
    7  *   (at your option) any later version.
    8  * 
    9  *   This program is distributed in the hope that it will be useful,
   10  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
   11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
   12  *   the GNU General Public License for more details.
   13  *
   14  *   You should have received a copy of the GNU General Public License
   15  *   along with this program;  if not, write to the Free Software 
   16  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
   17  */
   18 
   19 #include <linux/fs.h>
   20 #include "jfs_incore.h"
   21 #include "jfs_dmap.h"
   22 #include "jfs_imap.h"
   23 #include "jfs_lock.h"
   24 #include "jfs_metapage.h"
   25 #include "jfs_superblock.h"
   26 #include "jfs_debug.h"
   27 
   28 /*
   29  *      Debug code for double-checking block map
   30  */
   31 /* #define      _JFS_DEBUG_DMAP 1 */
   32 
   33 #ifdef  _JFS_DEBUG_DMAP
   34 #define DBINITMAP(size,ipbmap,results) \
   35         DBinitmap(size,ipbmap,results)
   36 #define DBALLOC(dbmap,mapsize,blkno,nblocks) \
   37         DBAlloc(dbmap,mapsize,blkno,nblocks)
   38 #define DBFREE(dbmap,mapsize,blkno,nblocks) \
   39         DBFree(dbmap,mapsize,blkno,nblocks)
   40 #define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
   41         DBAllocCK(dbmap,mapsize,blkno,nblocks)
   42 #define DBFREECK(dbmap,mapsize,blkno,nblocks) \
   43         DBFreeCK(dbmap,mapsize,blkno,nblocks)
   44 
   45 static void DBinitmap(s64, struct inode *, u32 **);
   46 static void DBAlloc(uint *, s64, s64, s64);
   47 static void DBFree(uint *, s64, s64, s64);
   48 static void DBAllocCK(uint *, s64, s64, s64);
   49 static void DBFreeCK(uint *, s64, s64, s64);
   50 #else
   51 #define DBINITMAP(size,ipbmap,results)
   52 #define DBALLOC(dbmap, mapsize, blkno, nblocks)
   53 #define DBFREE(dbmap, mapsize, blkno, nblocks)
   54 #define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
   55 #define DBFREECK(dbmap, mapsize, blkno, nblocks)
   56 #endif                          /* _JFS_DEBUG_DMAP */
   57 
   58 /*
   59  *      SERIALIZATION of the Block Allocation Map.
   60  *
   61  *      the working state of the block allocation map is accessed in
   62  *      two directions:
   63  *      
   64  *      1) allocation and free requests that start at the dmap
   65  *         level and move up through the dmap control pages (i.e.
   66  *         the vast majority of requests).
   67  * 
   68  *      2) allocation requests that start at dmap control page
   69  *         level and work down towards the dmaps.
   70  *      
   71  *      the serialization scheme used here is as follows. 
   72  *
   73  *      requests which start at the bottom are serialized against each 
   74  *      other through buffers and each requests holds onto its buffers 
   75  *      as it works it way up from a single dmap to the required level 
   76  *      of dmap control page.
   77  *      requests that start at the top are serialized against each other
   78  *      and request that start from the bottom by the multiple read/single
   79  *      write inode lock of the bmap inode. requests starting at the top
   80  *      take this lock in write mode while request starting at the bottom
   81  *      take the lock in read mode.  a single top-down request may proceed
   82  *      exclusively while multiple bottoms-up requests may proceed 
   83  *      simultaneously (under the protection of busy buffers).
   84  *      
   85  *      in addition to information found in dmaps and dmap control pages,
   86  *      the working state of the block allocation map also includes read/
   87  *      write information maintained in the bmap descriptor (i.e. total
   88  *      free block count, allocation group level free block counts).
   89  *      a single exclusive lock (BMAP_LOCK) is used to guard this information
   90  *      in the face of multiple-bottoms up requests.
   91  *      (lock ordering: IREAD_LOCK, BMAP_LOCK);
   92  *      
   93  *      accesses to the persistent state of the block allocation map (limited
   94  *      to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
   95  */
   96 
   97 #define BMAP_LOCK_INIT(bmp)     init_MUTEX(&bmp->db_bmaplock)
   98 #define BMAP_LOCK(bmp)          down(&bmp->db_bmaplock)
   99 #define BMAP_UNLOCK(bmp)        up(&bmp->db_bmaplock)
  100 
  101 /*
  102  * forward references
  103  */
  104 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
  105                         int nblocks);
  106 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
  107 static void dbBackSplit(dmtree_t * tp, int leafno);
  108 static void dbJoin(dmtree_t * tp, int leafno, int newval);
  109 static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
  110 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
  111                     int level);
  112 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
  113 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
  114                        int nblocks);
  115 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
  116                        int nblocks,
  117                        int l2nb, s64 * results);
  118 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
  119                        int nblocks);
  120 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
  121                           int l2nb,
  122                           s64 * results);
  123 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
  124                      s64 * results);
  125 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
  126                       s64 * results);
  127 int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
  128 static int dbFindBits(u32 word, int l2nb);
  129 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
  130 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
  131 static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
  132                        int nblocks);
  133 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
  134                       int nblocks);
  135 static int dbMaxBud(u8 * cp);
  136 s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
  137 int blkstol2(s64 nb);
  138 void fsDirty(void);
  139 
  140 int cntlz(u32 value);
  141 int cnttz(u32 word);
  142 
  143 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
  144                          int nblocks);
  145 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
  146 static int dbInitDmapTree(struct dmap * dp);
  147 static int dbInitTree(struct dmaptree * dtp);
  148 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
  149 static int dbGetL2AGSize(s64 nblocks);
  150 
  151 /*
  152  *      buddy table
  153  *
  154  * table used for determining buddy sizes within characters of 
  155  * dmap bitmap words.  the characters themselves serve as indexes
  156  * into the table, with the table elements yielding the maximum
  157  * binary buddy of free bits within the character.
  158  */
  159 signed char budtab[256] = {
  160         3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  161         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  162         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  163         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  164         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  165         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  166         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  167         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  168         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  169         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  170         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  171         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  172         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  173         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  174         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
  175         2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
  176 };
  177 
  178 
  179 /*
  180  * NAME:        dbMount()
  181  *
  182  * FUNCTION:    initializate the block allocation map.
  183  *
  184  *              memory is allocated for the in-core bmap descriptor and
  185  *              the in-core descriptor is initialized from disk.
  186  *
  187  * PARAMETERS:
  188  *      ipbmap  -  pointer to in-core inode for the block map.
  189  *
  190  * RETURN VALUES:
  191  *      0       - success
  192  *      ENOMEM  - insufficient memory
  193  *      EIO     - i/o error
  194  */
  195 int dbMount(struct inode *ipbmap)
  196 {
  197         struct bmap *bmp;
  198         struct dbmap *dbmp_le;
  199         struct metapage *mp;
  200         int i;
  201 
  202         /*
  203          * allocate/initialize the in-memory bmap descriptor
  204          */
  205         /* allocate memory for the in-memory bmap descriptor */
  206         bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
  207         if (bmp == NULL)
  208                 return (ENOMEM);
  209 
  210         /* read the on-disk bmap descriptor. */
  211         mp = read_metapage(ipbmap,
  212                            BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
  213                            PSIZE, 0);
  214         if (mp == NULL) {
  215                 kfree(bmp);
  216                 return (EIO);
  217         }
  218 
  219         /* copy the on-disk bmap descriptor to its in-memory version. */
  220         dbmp_le = (struct dbmap *) mp->data;
  221         bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
  222         bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
  223         bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
  224         bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
  225         bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
  226         bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
  227         bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
  228         bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
  229         bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
  230         bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
  231         bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
  232         bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
  233         for (i = 0; i < MAXAG; i++)
  234                 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
  235         bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
  236         bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
  237 
  238         /* release the buffer. */
  239         release_metapage(mp);
  240 
  241         /* bind the bmap inode and the bmap descriptor to each other. */
  242         bmp->db_ipbmap = ipbmap;
  243         JFS_SBI(ipbmap->i_sb)->bmap = bmp;
  244 
  245         memset(bmp->db_active, 0, sizeof(bmp->db_active));
  246         DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
  247 
  248         /*
  249          * allocate/initialize the bmap lock
  250          */
  251         BMAP_LOCK_INIT(bmp);
  252 
  253         return (0);
  254 }
  255 
  256 
  257 /*
  258  * NAME:        dbUnmount()
  259  *
  260  * FUNCTION:    terminate the block allocation map in preparation for
  261  *              file system unmount.
  262  *
  263  *              the in-core bmap descriptor is written to disk and
  264  *              the memory for this descriptor is freed.
  265  *
  266  * PARAMETERS:
  267  *      ipbmap  -  pointer to in-core inode for the block map.
  268  *
  269  * RETURN VALUES:
  270  *      0       - success
  271  *      EIO     - i/o error
  272  */
  273 int dbUnmount(struct inode *ipbmap, int mounterror)
  274 {
  275         struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
  276         int i;
  277 
  278         if (!(mounterror || isReadOnly(ipbmap)))
  279                 dbSync(ipbmap);
  280 
  281         /*
  282          * Invalidate the page cache buffers
  283          */
  284         truncate_inode_pages(ipbmap->i_mapping, 0);
  285 
  286         /*
  287          * Sanity Check
  288          */
  289         for (i = 0; i < bmp->db_numag; i++)
  290                 if (atomic_read(&bmp->db_active[i]))
  291                         printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
  292                                i, atomic_read(&bmp->db_active[i]));
  293 
  294         /* free the memory for the in-memory bmap. */
  295         kfree(bmp);
  296 
  297         return (0);
  298 }
  299 
  300 /*
  301  *      dbSync()
  302  */
  303 int dbSync(struct inode *ipbmap)
  304 {
  305         struct dbmap *dbmp_le;
  306         struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
  307         struct metapage *mp;
  308         int i;
  309 
  310         /*
  311          * write bmap global control page
  312          */
  313         /* get the buffer for the on-disk bmap descriptor. */
  314         mp = read_metapage(ipbmap,
  315                            BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
  316                            PSIZE, 0);
  317         if (mp == NULL) {
  318                 jfs_err("dbSync: read_metapage failed!");
  319                 return (EIO);
  320         }
  321         /* copy the in-memory version of the bmap to the on-disk version */
  322         dbmp_le = (struct dbmap *) mp->data;
  323         dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
  324         dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
  325         dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
  326         dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
  327         dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
  328         dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
  329         dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
  330         dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
  331         dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
  332         dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
  333         dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
  334         dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
  335         for (i = 0; i < MAXAG; i++)
  336                 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
  337         dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
  338         dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
  339 
  340         /* write the buffer */
  341         write_metapage(mp);
  342 
  343         /*
  344          * write out dirty pages of bmap
  345          */
  346         fsync_inode_data_buffers(ipbmap);
  347 
  348         ipbmap->i_state |= I_DIRTY;
  349         diWriteSpecial(ipbmap, 0);
  350 
  351         return (0);
  352 }
  353 
  354 
  355 /*
  356  * NAME:        dbFree()
  357  *
  358  * FUNCTION:    free the specified block range from the working block
  359  *              allocation map.
  360  *
  361  *              the blocks will be free from the working map one dmap
  362  *              at a time.
  363  *
  364  * PARAMETERS:
  365  *      ip      -  pointer to in-core inode;
  366  *      blkno   -  starting block number to be freed.
  367  *      nblocks -  number of blocks to be freed.
  368  *
  369  * RETURN VALUES:
  370  *      0       - success
  371  *      EIO     - i/o error
  372  */
  373 int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
  374 {
  375         struct metapage *mp;
  376         struct dmap *dp;
  377         int nb, rc;
  378         s64 lblkno, rem;
  379         struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
  380         struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
  381 
  382         IREAD_LOCK(ipbmap);
  383 
  384         /* block to be freed better be within the mapsize. */
  385         if (unlikely(blkno + nblocks > bmp->db_mapsize)) {
  386                 /*
  387                  * Trying to catch a bug here
  388                  */
  389                 printk(KERN_ERR
  390                        "JFS: dbFree asked to free block larger than mapsize\n");
  391                 printk(KERN_ERR
  392                        "blkno = 0x%Lx, nblocks = 0x%Lx, mapsize = 0x%Lx\n",
  393                        blkno, nblocks, bmp->db_mapsize);
  394                 printk(KERN_ERR "ino = %ld, cflag = %lx\n", ip->i_ino,
  395                        JFS_IP(ip)->cflag);
  396                 dump_stack();
  397                 /* Make sure fsck fixes things back up */
  398                 updateSuper(ip->i_sb, FM_DIRTY);
  399                 return -EIO;    /* Nobody checks the return code */
  400         }
  401 
  402         /*
  403          * free the blocks a dmap at a time.
  404          */
  405         mp = NULL;
  406         for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
  407                 /* release previous dmap if any */
  408                 if (mp) {
  409                         write_metapage(mp);
  410                 }
  411 
  412                 /* get the buffer for the current dmap. */
  413                 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
  414                 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
  415                 if (mp == NULL) {
  416                         IREAD_UNLOCK(ipbmap);
  417                         return (EIO);
  418                 }
  419                 dp = (struct dmap *) mp->data;
  420 
  421                 /* determine the number of blocks to be freed from
  422                  * this dmap.
  423                  */
  424                 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
  425 
  426                 DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
  427 
  428                 /* free the blocks. */
  429                 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
  430                         release_metapage(mp);
  431                         IREAD_UNLOCK(ipbmap);
  432                         return (rc);
  433                 }
  434 
  435                 DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
  436         }
  437 
  438         /* write the last buffer. */
  439         write_metapage(mp);
  440 
  441         IREAD_UNLOCK(ipbmap);
  442 
  443         return (0);
  444 }
  445 
  446 
  447 /*
  448  * NAME:        dbUpdatePMap()
  449  *
  450  * FUNCTION:    update the allocation state (free or allocate) of the
  451  *              specified block range in the persistent block allocation map.
  452  *              
  453  *              the blocks will be updated in the persistent map one
  454  *              dmap at a time.
  455  *
  456  * PARAMETERS:
  457  *      ipbmap  -  pointer to in-core inode for the block map.
  458  *      free    - TRUE if block range is to be freed from the persistent
  459  *                map; FALSE if it is to   be allocated.
  460  *      blkno   -  starting block number of the range.
  461  *      nblocks -  number of contiguous blocks in the range.
  462  *      tblk    -  transaction block;
  463  *
  464  * RETURN VALUES:
  465  *      0       - success
  466  *      EIO     - i/o error
  467  */
  468 int
  469 dbUpdatePMap(struct inode *ipbmap,
  470              int free, s64 blkno, s64 nblocks, struct tblock * tblk)
  471 {
  472         int nblks, dbitno, wbitno, rbits;
  473         int word, nbits, nwords;
  474         struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
  475         s64 lblkno, rem, lastlblkno;
  476         u32 mask;
  477         struct dmap *dp;
  478         struct metapage *mp;
  479         struct jfs_log *log;
  480         int lsn, difft, diffp;
  481 
  482         /* the blocks better be within the mapsize. */
  483         assert(blkno + nblocks <= bmp->db_mapsize);
  484 
  485         /* compute delta of transaction lsn from log syncpt */
  486         lsn = tblk->lsn;
  487         log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
  488         logdiff(difft, lsn, log);
  489 
  490         /*
  491          * update the block state a dmap at a time.
  492          */
  493         mp = NULL;
  494         lastlblkno = 0;
  495         for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
  496                 /* get the buffer for the current dmap. */
  497                 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
  498                 if (lblkno != lastlblkno) {
  499                         if (mp) {
  500                                 write_metapage(mp);
  501                         }
  502 
  503                         mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
  504                                            0);
  505                         if (mp == NULL)
  506                                 return (EIO);
  507                 }
  508                 dp = (struct dmap *) mp->data;
  509 
  510                 /* determine the bit number and word within the dmap of
  511                  * the starting block.  also determine how many blocks
  512                  * are to be updated within this dmap.
  513                  */
  514                 dbitno = blkno & (BPERDMAP - 1);
  515                 word = dbitno >> L2DBWORD;
  516                 nblks = min(rem, (s64)BPERDMAP - dbitno);
  517 
  518                 /* update the bits of the dmap words. the first and last
  519                  * words may only have a subset of their bits updated. if
  520                  * this is the case, we'll work against that word (i.e.
  521                  * partial first and/or last) only in a single pass.  a 
  522                  * single pass will also be used to update all words that
  523                  * are to have all their bits updated.
  524                  */
  525                 for (rbits = nblks; rbits > 0;
  526                      rbits -= nbits, dbitno += nbits) {
  527                         /* determine the bit number within the word and
  528                          * the number of bits within the word.
  529                          */
  530                         wbitno = dbitno & (DBWORD - 1);
  531                         nbits = min(rbits, DBWORD - wbitno);
  532 
  533                         /* check if only part of the word is to be updated. */
  534                         if (nbits < DBWORD) {
  535                                 /* update (free or allocate) the bits
  536                                  * in this word.
  537                                  */
  538                                 mask =
  539                                     (ONES << (DBWORD - nbits) >> wbitno);
  540                                 if (free)
  541                                         dp->pmap[word] &=
  542                                             cpu_to_le32(~mask);
  543                                 else
  544                                         dp->pmap[word] |=
  545                                             cpu_to_le32(mask);
  546 
  547                                 word += 1;
  548                         } else {
  549                                 /* one or more words are to have all
  550                                  * their bits updated.  determine how
  551                                  * many words and how many bits.
  552                                  */
  553                                 nwords = rbits >> L2DBWORD;
  554                                 nbits = nwords << L2DBWORD;
  555 
  556                                 /* update (free or allocate) the bits
  557                                  * in these words.
  558                                  */
  559                                 if (free)
  560                                         memset(&dp->pmap[word], 0,
  561                                                nwords * 4);
  562                                 else
  563                                         memset(&dp->pmap[word], (int) ONES,
  564                                                nwords * 4);
  565 
  566                                 word += nwords;
  567                         }
  568                 }
  569 
  570                 /*
  571                  * update dmap lsn
  572                  */
  573                 if (lblkno == lastlblkno)
  574                         continue;
  575 
  576                 lastlblkno = lblkno;
  577 
  578                 if (mp->lsn != 0) {
  579                         /* inherit older/smaller lsn */
  580                         logdiff(diffp, mp->lsn, log);
  581                         if (difft < diffp) {
  582                                 mp->lsn = lsn;
  583 
  584                                 /* move bp after tblock in logsync list */
  585                                 LOGSYNC_LOCK(log);
  586                                 list_del(&mp->synclist);
  587                                 list_add(&mp->synclist, &tblk->synclist);
  588                                 LOGSYNC_UNLOCK(log);
  589                         }
  590 
  591                         /* inherit younger/larger clsn */
  592                         LOGSYNC_LOCK(log);
  593                         logdiff(difft, tblk->clsn, log);
  594                         logdiff(diffp, mp->clsn, log);
  595                         if (difft > diffp)
  596                                 mp->clsn = tblk->clsn;
  597                         LOGSYNC_UNLOCK(log);
  598                 } else {
  599                         mp->log = log;
  600                         mp->lsn = lsn;
  601 
  602                         /* insert bp after tblock in logsync list */
  603                         LOGSYNC_LOCK(log);
  604 
  605                         log->count++;
  606                         list_add(&mp->synclist, &tblk->synclist);
  607 
  608                         mp->clsn = tblk->clsn;
  609                         LOGSYNC_UNLOCK(log);
  610                 }
  611         }
  612 
  613         /* write the last buffer. */
  614         if (mp) {
  615                 write_metapage(mp);
  616         }
  617 
  618         return (0);
  619 }
  620 
  621 
  622 /*
  623  * NAME:        dbNextAG()
  624  *
  625  * FUNCTION:    find the preferred allocation group for new allocations.
  626  *
  627  *              Within the allocation groups, we maintain a preferred
  628  *              allocation group which consists of a group with at least
  629  *              average free space.  It is the preferred group that we target
  630  *              new inode allocation towards.  The tie-in between inode
  631  *              allocation and block allocation occurs as we allocate the
  632  *              first (data) block of an inode and specify the inode (block)
  633  *              as the allocation hint for this block.
  634  *
  635  *              We try to avoid having more than one open file growing in
  636  *              an allocation group, as this will lead to fragmentation.
  637  *              This differs from the old OS/2 method of trying to keep
  638  *              empty ags around for large allocations.
  639  *
  640  * PARAMETERS:
  641  *      ipbmap  -  pointer to in-core inode for the block map.
  642  *
  643  * RETURN VALUES:
  644  *      the preferred allocation group number.
  645  */
  646 int dbNextAG(struct inode *ipbmap)
  647 {
  648         s64 avgfree;
  649         int agpref;
  650         s64 hwm = 0;
  651         int i;
  652         int next_best = -1;
  653         struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
  654 
  655         BMAP_LOCK(bmp);
  656 
  657         /* determine the average number of free blocks within the ags. */
  658         avgfree = (u32)bmp->db_nfree / bmp->db_numag;
  659 
  660         /*
  661          * if the current preferred ag does not have an active allocator
  662          * and has at least average freespace, return it
  663          */
  664         agpref = bmp->db_agpref;
  665         if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
  666             (bmp->db_agfree[agpref] >= avgfree))
  667                 goto unlock;
  668 
  669         /* From the last preferred ag, find the next one with at least
  670          * average free space.
  671          */
  672         for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
  673                 if (agpref == bmp->db_numag)
  674                         agpref = 0;
  675 
  676                 if (atomic_read(&bmp->db_active[agpref]))
  677                         /* open file is currently growing in this ag */
  678                         continue;
  679                 if (bmp->db_agfree[agpref] >= avgfree) {
  680                         /* Return this one */
  681                         bmp->db_agpref = agpref;
  682                         goto unlock;
  683                 } else if (bmp->db_agfree[agpref] > hwm) {
  684                         /* Less than avg. freespace, but best so far */
  685                         hwm = bmp->db_agfree[agpref];
  686                         next_best = agpref;
  687                 }
  688         }
  689 
  690         /*
  691          * If no inactive ag was found with average freespace, use the
  692          * next best
  693          */
  694         if (next_best != -1)
  695                 bmp->db_agpref = next_best;
  696         /* else leave db_agpref unchanged */
  697 unlock:
  698         BMAP_UNLOCK(bmp);
  699 
  700         /* return the preferred group.
  701          */
  702         return (bmp->db_agpref);
  703 }
  704 
  705 /*
  706  * NAME:        dbAlloc()
  707  *
  708  * FUNCTION:    attempt to allocate a specified number of contiguous free
  709  *              blocks from the working allocation block map.
  710  *
  711  *              the block allocation policy uses hints and a multi-step
  712  *              approach.
  713  *
  714  *              for allocation requests smaller than the number of blocks
  715  *              per dmap, we first try to allocate the new blocks
  716  *              immediately following the hint.  if these blocks are not
  717  *              available, we try to allocate blocks near the hint.  if
  718  *              no blocks near the hint are available, we next try to 
  719  *              allocate within the same dmap as contains the hint.
  720  *
  721  *              if no blocks are available in the dmap or the allocation
  722  *              request is larger than the dmap size, we try to allocate
  723  *              within the same allocation group as contains the hint. if
  724  *              this does not succeed, we finally try to allocate anywhere
  725  *              within the aggregate.
  726  *
  727  *              we also try to allocate anywhere within the aggregate for
  728  *              for allocation requests larger than the allocation group
  729  *              size or requests that specify no hint value.
  730  *
  731  * PARAMETERS:
  732  *      ip      -  pointer to in-core inode;
  733  *      hint    - allocation hint.
  734  *      nblocks - number of contiguous blocks in the range.
  735  *      results - on successful return, set to the starting block number
  736  *                of the newly allocated contiguous range.
  737  *
  738  * RETURN VALUES:
  739  *      0       - success
  740  *      ENOSPC  - insufficient disk resources
  741  *      EIO     - i/o error
  742  */
  743 int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
  744 {
  745         int rc, agno;
  746         struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
  747         struct bmap *bmp;
  748         struct metapage *mp;
  749         s64 lblkno, blkno;
  750         struct dmap *dp;
  751         int l2nb;
  752         s64 mapSize;
  753         int writers;
  754 
  755         /* assert that nblocks is valid */
  756         assert(nblocks > 0);
  757 
  758 #ifdef _STILL_TO_PORT
  759         /* DASD limit check                                     F226941 */
  760         if (OVER_LIMIT(ip, nblocks))
  761                 return ENOSPC;
  762 #endif                          /* _STILL_TO_PORT */
  763 
  764         /* get the log2 number of blocks to be allocated.
  765          * if the number of blocks is not a log2 multiple, 
  766          * it will be rounded up to the next log2 multiple.
  767          */
  768         l2nb = BLKSTOL2(nblocks);
  769 
  770         bmp = JFS_SBI(ip->i_sb)->bmap;
  771 
  772 //retry:        /* serialize w.r.t.extendfs() */
  773         mapSize = bmp->db_mapsize;
  774 
  775         /* the hint should be within the map */
  776         assert(hint < mapSize);
  777 
  778         /* if the number of blocks to be allocated is greater than the
  779          * allocation group size, try to allocate anywhere.
  780          */
  781         if (l2nb > bmp->db_agl2size) {
  782                 IWRITE_LOCK(ipbmap);
  783 
  784                 rc = dbAllocAny(bmp, nblocks, l2nb, results);
  785                 if (rc == 0) {
  786                         DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
  787                                 nblocks);
  788                 }
  789 
  790                 goto write_unlock;
  791         }
  792 
  793         /*
  794          * If no hint, let dbNextAG recommend an allocation group
  795          */
  796         if (hint == 0)
  797                 goto pref_ag;
  798 
  799         /* we would like to allocate close to the hint.  adjust the
  800          * hint to the block following the hint since the allocators
  801          * will start looking for free space starting at this point.
  802          */
  803         blkno = hint + 1;
  804 
  805         if (blkno >= bmp->db_mapsize)
  806                 goto pref_ag;
  807 
  808         agno = blkno >> bmp->db_agl2size;
  809 
  810         /* check if blkno crosses over into a new allocation group.
  811          * if so, check if we should allow allocations within this
  812          * allocation group.
  813          */
  814         if ((blkno & (bmp->db_agsize - 1)) == 0)
  815                 /* check if the AG is currenly being written to.
  816                  * if so, call dbNextAG() to find a non-busy
  817                  * AG with sufficient free space.
  818                  */
  819                 if (atomic_read(&bmp->db_active[agno]))
  820                         goto pref_ag;
  821 
  822         /* check if the allocation request size can be satisfied from a
  823          * single dmap.  if so, try to allocate from the dmap containing
  824          * the hint using a tiered strategy.
  825          */
  826         if (nblocks <= BPERDMAP) {
  827                 IREAD_LOCK(ipbmap);
  828 
  829                 /* get the buffer for the dmap containing the hint.
  830                  */
  831                 rc = EIO;
  832                 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
  833                 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
  834                 if (mp == NULL)
  835                         goto read_unlock;
  836 
  837                 dp = (struct dmap *) mp->data;
  838 
  839                 /* first, try to satisfy the allocation request with the
  840                  * blocks beginning at the hint.
  841                  */
  842                 if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
  843                     != ENOSPC) {
  844                         if (rc == 0) {
  845                                 *results = blkno;
  846                                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
  847                                         *results, nblocks);
  848                                 mark_metapage_dirty(mp);
  849                         }
  850 
  851                         release_metapage(mp);
  852                         goto read_unlock;
  853                 }
  854 
  855                 writers = atomic_read(&bmp->db_active[agno]);
  856                 if ((writers > 1) ||
  857                     ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
  858                         /*
  859                          * Someone else is writing in this allocation
  860                          * group.  To avoid fragmenting, try another ag
  861                          */
  862                         release_metapage(mp);
  863                         IREAD_UNLOCK(ipbmap);
  864                         goto pref_ag;
  865                 }
  866 
  867                 /* next, try to satisfy the allocation request with blocks
  868                  * near the hint.
  869                  */
  870                 if ((rc =
  871                      dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
  872                     != ENOSPC) {
  873                         if (rc == 0) {
  874                                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
  875                                         *results, nblocks);
  876                                 mark_metapage_dirty(mp);
  877                         }
  878 
  879                         release_metapage(mp);
  880                         goto read_unlock;
  881                 }
  882 
  883                 /* try to satisfy the allocation request with blocks within
  884                  * the same dmap as the hint.
  885                  */
  886                 if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
  887                     != ENOSPC) {
  888                         if (rc == 0) {
  889                                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
  890                                         *results, nblocks);
  891                                 mark_metapage_dirty(mp);
  892                         }
  893 
  894                         release_metapage(mp);
  895                         goto read_unlock;
  896                 }
  897 
  898                 release_metapage(mp);
  899                 IREAD_UNLOCK(ipbmap);
  900         }
  901 
  902         /* try to satisfy the allocation request with blocks within
  903          * the same allocation group as the hint.
  904          */
  905         IWRITE_LOCK(ipbmap);
  906         if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
  907             != ENOSPC) {
  908                 if (rc == 0)
  909                         DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
  910                                 *results, nblocks);
  911                 goto write_unlock;
  912         }
  913         IWRITE_UNLOCK(ipbmap);
  914 
  915 
  916       pref_ag:
  917         /*
  918          * Let dbNextAG recommend a preferred allocation group
  919          */
  920         agno = dbNextAG(ipbmap);
  921         IWRITE_LOCK(ipbmap);
  922 
  923         /* Try to allocate within this allocation group.  if that fails, try to
  924          * allocate anywhere in the map.
  925          */
  926         if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == ENOSPC)
  927                 rc = dbAllocAny(bmp, nblocks, l2nb, results);
  928         if (rc == 0) {
  929                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
  930         }
  931 
  932       write_unlock:
  933         IWRITE_UNLOCK(ipbmap);
  934 
  935         return (rc);
  936 
  937       read_unlock:
  938         IREAD_UNLOCK(ipbmap);
  939 
  940         return (rc);
  941 }
  942 
  943 
  944 /*
  945  * NAME:        dbAllocExact()
  946  *
  947  * FUNCTION:    try to allocate the requested extent;
  948  *
  949  * PARAMETERS:
  950  *      ip      - pointer to in-core inode;
  951  *      blkno   - extent address;
  952  *      nblocks - extent length;
  953  *
  954  * RETURN VALUES:
  955  *      0       - success
  956  *      ENOSPC  - insufficient disk resources
  957  *      EIO     - i/o error
  958  */
  959 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
  960 {
  961         int rc;
  962         struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
  963         struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
  964         struct dmap *dp;
  965         s64 lblkno;
  966         struct metapage *mp;
  967 
  968         IREAD_LOCK(ipbmap);
  969 
  970         /*
  971          * validate extent request:
  972          *
  973          * note: defragfs policy:
  974          *  max 64 blocks will be moved.  
  975          *  allocation request size must be satisfied from a single dmap.
  976          */
  977         if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
  978                 IREAD_UNLOCK(ipbmap);
  979                 return EINVAL;
  980         }
  981 
  982         if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
  983                 /* the free space is no longer available */
  984                 IREAD_UNLOCK(ipbmap);
  985                 return ENOSPC;
  986         }
  987 
  988         /* read in the dmap covering the extent */
  989         lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
  990         mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
  991         if (mp == NULL) {
  992                 IREAD_UNLOCK(ipbmap);
  993                 return (EIO);
  994         }
  995         dp = (struct dmap *) mp->data;
  996 
  997         /* try to allocate the requested extent */
  998         rc = dbAllocNext(bmp, dp, blkno, nblocks);
  999 
 1000         IREAD_UNLOCK(ipbmap);
 1001 
 1002         if (rc == 0) {
 1003                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
 1004                 mark_metapage_dirty(mp);
 1005         }
 1006         release_metapage(mp);
 1007 
 1008         return (rc);
 1009 }
 1010 
 1011 
 1012 /*
 1013  * NAME:        dbReAlloc()
 1014  *
 1015  * FUNCTION:    attempt to extend a current allocation by a specified
 1016  *              number of blocks.
 1017  *
 1018  *              this routine attempts to satisfy the allocation request
 1019  *              by first trying to extend the existing allocation in
 1020  *              place by allocating the additional blocks as the blocks
 1021  *              immediately following the current allocation.  if these
 1022  *              blocks are not available, this routine will attempt to
 1023  *              allocate a new set of contiguous blocks large enough
 1024  *              to cover the existing allocation plus the additional
 1025  *              number of blocks required.
 1026  *
 1027  * PARAMETERS:
 1028  *      ip          -  pointer to in-core inode requiring allocation.
 1029  *      blkno       -  starting block of the current allocation.
 1030  *      nblocks     -  number of contiguous blocks within the current
 1031  *                     allocation.
 1032  *      addnblocks  -  number of blocks to add to the allocation.
 1033  *      results -      on successful return, set to the starting block number
 1034  *                     of the existing allocation if the existing allocation
 1035  *                     was extended in place or to a newly allocated contiguous
 1036  *                     range if the existing allocation could not be extended
 1037  *                     in place.
 1038  *
 1039  * RETURN VALUES:
 1040  *      0       - success
 1041  *      ENOSPC  - insufficient disk resources
 1042  *      EIO     - i/o error
 1043  */
 1044 int
 1045 dbReAlloc(struct inode *ip,
 1046           s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
 1047 {
 1048         int rc;
 1049 
 1050         /* try to extend the allocation in place.
 1051          */
 1052         if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
 1053                 *results = blkno;
 1054                 return (0);
 1055         } else {
 1056                 if (rc != ENOSPC)
 1057                         return (rc);
 1058         }
 1059 
 1060         /* could not extend the allocation in place, so allocate a
 1061          * new set of blocks for the entire request (i.e. try to get
 1062          * a range of contiguous blocks large enough to cover the
 1063          * existing allocation plus the additional blocks.)
 1064          */
 1065         return (dbAlloc
 1066                 (ip, blkno + nblocks - 1, addnblocks + nblocks, results));
 1067 }
 1068 
 1069 
 1070 /*
 1071  * NAME:        dbExtend()
 1072  *
 1073  * FUNCTION:    attempt to extend a current allocation by a specified
 1074  *              number of blocks.
 1075  *
 1076  *              this routine attempts to satisfy the allocation request
 1077  *              by first trying to extend the existing allocation in
 1078  *              place by allocating the additional blocks as the blocks
 1079  *              immediately following the current allocation.
 1080  *
 1081  * PARAMETERS:
 1082  *      ip          -  pointer to in-core inode requiring allocation.
 1083  *      blkno       -  starting block of the current allocation.
 1084  *      nblocks     -  number of contiguous blocks within the current
 1085  *                     allocation.
 1086  *      addnblocks  -  number of blocks to add to the allocation.
 1087  *
 1088  * RETURN VALUES:
 1089  *      0       - success
 1090  *      ENOSPC  - insufficient disk resources
 1091  *      EIO     - i/o error
 1092  */
 1093 int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
 1094 {
 1095         struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 1096         s64 lblkno, lastblkno, extblkno;
 1097         uint rel_block;
 1098         struct metapage *mp;
 1099         struct dmap *dp;
 1100         int rc;
 1101         struct inode *ipbmap = sbi->ipbmap;
 1102         struct bmap *bmp;
 1103 
 1104         /*
 1105          * We don't want a non-aligned extent to cross a page boundary
 1106          */
 1107         if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
 1108             (rel_block + nblocks + addnblocks > sbi->nbperpage))
 1109                 return (ENOSPC);
 1110 
 1111         /* get the last block of the current allocation */
 1112         lastblkno = blkno + nblocks - 1;
 1113 
 1114         /* determine the block number of the block following
 1115          * the existing allocation.
 1116          */
 1117         extblkno = lastblkno + 1;
 1118 
 1119         IREAD_LOCK(ipbmap);
 1120 
 1121         /* better be within the file system */
 1122         bmp = sbi->bmap;
 1123         assert(lastblkno >= 0 && lastblkno < bmp->db_mapsize);
 1124 
 1125         /* we'll attempt to extend the current allocation in place by
 1126          * allocating the additional blocks as the blocks immediately
 1127          * following the current allocation.  we only try to extend the
 1128          * current allocation in place if the number of additional blocks
 1129          * can fit into a dmap, the last block of the current allocation
 1130          * is not the last block of the file system, and the start of the
 1131          * inplace extension is not on an allocation group boundry.
 1132          */
 1133         if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
 1134             (extblkno & (bmp->db_agsize - 1)) == 0) {
 1135                 IREAD_UNLOCK(ipbmap);
 1136                 return (ENOSPC);
 1137         }
 1138 
 1139         /* get the buffer for the dmap containing the first block
 1140          * of the extension.
 1141          */
 1142         lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
 1143         mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
 1144         if (mp == NULL) {
 1145                 IREAD_UNLOCK(ipbmap);
 1146                 return (EIO);
 1147         }
 1148 
 1149         DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
 1150         dp = (struct dmap *) mp->data;
 1151 
 1152         /* try to allocate the blocks immediately following the
 1153          * current allocation.
 1154          */
 1155         rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
 1156 
 1157         IREAD_UNLOCK(ipbmap);
 1158 
 1159         /* were we successful ? */
 1160         if (rc == 0) {
 1161                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno,
 1162                         addnblocks);
 1163                 write_metapage(mp);
 1164         } else {
 1165                 /* we were not successful */
 1166                 release_metapage(mp);
 1167                 assert(rc == ENOSPC || rc == EIO);
 1168         }
 1169 
 1170         return (rc);
 1171 }
 1172 
 1173 
 1174 /*
 1175  * NAME:        dbAllocNext()
 1176  *
 1177  * FUNCTION:    attempt to allocate the blocks of the specified block
 1178  *              range within a dmap.
 1179  *
 1180  * PARAMETERS:
 1181  *      bmp     -  pointer to bmap descriptor
 1182  *      dp      -  pointer to dmap.
 1183  *      blkno   -  starting block number of the range.
 1184  *      nblocks -  number of contiguous free blocks of the range.
 1185  *
 1186  * RETURN VALUES:
 1187  *      0       - success
 1188  *      ENOSPC  - insufficient disk resources
 1189  *      EIO     - i/o error
 1190  *
 1191  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
 1192  */
 1193 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
 1194                        int nblocks)
 1195 {
 1196         int dbitno, word, rembits, nb, nwords, wbitno, nw;
 1197         int l2size;
 1198         s8 *leaf;
 1199         u32 mask;
 1200 
 1201         /* pick up a pointer to the leaves of the dmap tree.
 1202          */
 1203         leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
 1204 
 1205         /* determine the bit number and word within the dmap of the
 1206          * starting block.
 1207          */
 1208         dbitno = blkno & (BPERDMAP - 1);
 1209         word = dbitno >> L2DBWORD;
 1210 
 1211         /* check if the specified block range is contained within
 1212          * this dmap.
 1213          */
 1214         if (dbitno + nblocks > BPERDMAP)
 1215                 return (ENOSPC);
 1216 
 1217         /* check if the starting leaf indicates that anything
 1218          * is free.
 1219          */
 1220         if (leaf[word] == NOFREE)
 1221                 return (ENOSPC);
 1222 
 1223         /* check the dmaps words corresponding to block range to see
 1224          * if the block range is free.  not all bits of the first and
 1225          * last words may be contained within the block range.  if this
 1226          * is the case, we'll work against those words (i.e. partial first
 1227          * and/or last) on an individual basis (a single pass) and examine
 1228          * the actual bits to determine if they are free.  a single pass
 1229          * will be used for all dmap words fully contained within the
 1230          * specified range.  within this pass, the leaves of the dmap
 1231          * tree will be examined to determine if the blocks are free. a
 1232          * single leaf may describe the free space of multiple dmap
 1233          * words, so we may visit only a subset of the actual leaves
 1234          * corresponding to the dmap words of the block range.
 1235          */
 1236         for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
 1237                 /* determine the bit number within the word and
 1238                  * the number of bits within the word.
 1239                  */
 1240                 wbitno = dbitno & (DBWORD - 1);
 1241                 nb = min(rembits, DBWORD - wbitno);
 1242 
 1243                 /* check if only part of the word is to be examined.
 1244                  */
 1245                 if (nb < DBWORD) {
 1246                         /* check if the bits are free.
 1247                          */
 1248                         mask = (ONES << (DBWORD - nb) >> wbitno);
 1249                         if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
 1250                                 return (ENOSPC);
 1251 
 1252                         word += 1;
 1253                 } else {
 1254                         /* one or more dmap words are fully contained
 1255                          * within the block range.  determine how many
 1256                          * words and how many bits.
 1257                          */
 1258                         nwords = rembits >> L2DBWORD;
 1259                         nb = nwords << L2DBWORD;
 1260 
 1261                         /* now examine the appropriate leaves to determine
 1262                          * if the blocks are free.
 1263                          */
 1264                         while (nwords > 0) {
 1265                                 /* does the leaf describe any free space ?
 1266                                  */
 1267                                 if (leaf[word] < BUDMIN)
 1268                                         return (ENOSPC);
 1269 
 1270                                 /* determine the l2 number of bits provided
 1271                                  * by this leaf.
 1272                                  */
 1273                                 l2size =
 1274                                     min((int)leaf[word], NLSTOL2BSZ(nwords));
 1275 
 1276                                 /* determine how many words were handled.
 1277                                  */
 1278                                 nw = BUDSIZE(l2size, BUDMIN);
 1279 
 1280                                 nwords -= nw;
 1281                                 word += nw;
 1282                         }
 1283                 }
 1284         }
 1285 
 1286         /* allocate the blocks.
 1287          */
 1288         return (dbAllocDmap(bmp, dp, blkno, nblocks));
 1289 }
 1290 
 1291 
 1292 /*
 1293  * NAME:        dbAllocNear()
 1294  *
 1295  * FUNCTION:    attempt to allocate a number of contiguous free blocks near
 1296  *              a specified block (hint) within a dmap.
 1297  *
 1298  *              starting with the dmap leaf that covers the hint, we'll
 1299  *              check the next four contiguous leaves for sufficient free
 1300  *              space.  if sufficient free space is found, we'll allocate
 1301  *              the desired free space.
 1302  *
 1303  * PARAMETERS:
 1304  *      bmp     -  pointer to bmap descriptor
 1305  *      dp      -  pointer to dmap.
 1306  *      blkno   -  block number to allocate near.
 1307  *      nblocks -  actual number of contiguous free blocks desired.
 1308  *      l2nb    -  log2 number of contiguous free blocks desired.
 1309  *      results -  on successful return, set to the starting block number
 1310  *                 of the newly allocated range.
 1311  *
 1312  * RETURN VALUES:
 1313  *      0       - success
 1314  *      ENOSPC  - insufficient disk resources
 1315  *      EIO     - i/o error
 1316  *
 1317  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
 1318  */
 1319 static int
 1320 dbAllocNear(struct bmap * bmp,
 1321             struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
 1322 {
 1323         int word, lword, rc;
 1324         s8 *leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
 1325 
 1326         /* determine the word within the dmap that holds the hint
 1327          * (i.e. blkno).  also, determine the last word in the dmap
 1328          * that we'll include in our examination.
 1329          */
 1330         word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
 1331         lword = min(word + 4, LPERDMAP);
 1332 
 1333         /* examine the leaves for sufficient free space.
 1334          */
 1335         for (; word < lword; word++) {
 1336                 /* does the leaf describe sufficient free space ?
 1337                  */
 1338                 if (leaf[word] < l2nb)
 1339                         continue;
 1340 
 1341                 /* determine the block number within the file system
 1342                  * of the first block described by this dmap word.
 1343                  */
 1344                 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
 1345 
 1346                 /* if not all bits of the dmap word are free, get the
 1347                  * starting bit number within the dmap word of the required
 1348                  * string of free bits and adjust the block number with the
 1349                  * value.
 1350                  */
 1351                 if (leaf[word] < BUDMIN)
 1352                         blkno +=
 1353                             dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
 1354 
 1355                 /* allocate the blocks.
 1356                  */
 1357                 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
 1358                         *results = blkno;
 1359 
 1360                 return (rc);
 1361         }
 1362 
 1363         return (ENOSPC);
 1364 }
 1365 
 1366 
 1367 /*
 1368  * NAME:        dbAllocAG()
 1369  *
 1370  * FUNCTION:    attempt to allocate the specified number of contiguous
 1371  *              free blocks within the specified allocation group.
 1372  *
 1373  *              unless the allocation group size is equal to the number
 1374  *              of blocks per dmap, the dmap control pages will be used to
 1375  *              find the required free space, if available.  we start the
 1376  *              search at the highest dmap control page level which
 1377  *              distinctly describes the allocation group's free space
 1378  *              (i.e. the highest level at which the allocation group's
 1379  *              free space is not mixed in with that of any other group).
 1380  *              in addition, we start the search within this level at a
 1381  *              height of the dmapctl dmtree at which the nodes distinctly
 1382  *              describe the allocation group's free space.  at this height,
 1383  *              the allocation group's free space may be represented by 1
 1384  *              or two sub-trees, depending on the allocation group size.
 1385  *              we search the top nodes of these subtrees left to right for
 1386  *              sufficient free space.  if sufficient free space is found,
 1387  *              the subtree is searched to find the leftmost leaf that 
 1388  *              has free space.  once we have made it to the leaf, we
 1389  *              move the search to the next lower level dmap control page
 1390  *              corresponding to this leaf.  we continue down the dmap control
 1391  *              pages until we find the dmap that contains or starts the
 1392  *              sufficient free space and we allocate at this dmap.
 1393  *
 1394  *              if the allocation group size is equal to the dmap size,
 1395  *              we'll start at the dmap corresponding to the allocation
 1396  *              group and attempt the allocation at this level.
 1397  *
 1398  *              the dmap control page search is also not performed if the
 1399  *              allocation group is completely free and we go to the first
 1400  *              dmap of the allocation group to do the allocation.  this is
 1401  *              done because the allocation group may be part (not the first
 1402  *              part) of a larger binary buddy system, causing the dmap
 1403  *              control pages to indicate no free space (NOFREE) within
 1404  *              the allocation group.
 1405  *
 1406  * PARAMETERS:
 1407  *      bmp     -  pointer to bmap descriptor
 1408  *      agno    - allocation group number.
 1409  *      nblocks -  actual number of contiguous free blocks desired.
 1410  *      l2nb    -  log2 number of contiguous free blocks desired.
 1411  *      results -  on successful return, set to the starting block number
 1412  *                 of the newly allocated range.
 1413  *
 1414  * RETURN VALUES:
 1415  *      0       - success
 1416  *      ENOSPC  - insufficient disk resources
 1417  *      EIO     - i/o error
 1418  *
 1419  * note: IWRITE_LOCK(ipmap) held on entry/exit;
 1420  */
 1421 static int
 1422 dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 1423 {
 1424         struct metapage *mp;
 1425         struct dmapctl *dcp;
 1426         int rc, ti, i, k, m, n, agperlev;
 1427         s64 blkno, lblkno;
 1428         int budmin;
 1429 
 1430         /* allocation request should not be for more than the
 1431          * allocation group size.
 1432          */
 1433         assert(l2nb <= bmp->db_agl2size);
 1434 
 1435         /* determine the starting block number of the allocation
 1436          * group.
 1437          */
 1438         blkno = (s64) agno << bmp->db_agl2size;
 1439 
 1440         /* check if the allocation group size is the minimum allocation
 1441          * group size or if the allocation group is completely free. if
 1442          * the allocation group size is the minimum size of BPERDMAP (i.e.
 1443          * 1 dmap), there is no need to search the dmap control page (below)
 1444          * that fully describes the allocation group since the allocation
 1445          * group is already fully described by a dmap.  in this case, we
 1446          * just call dbAllocCtl() to search the dmap tree and allocate the
 1447          * required space if available.  
 1448          *
 1449          * if the allocation group is completely free, dbAllocCtl() is
 1450          * also called to allocate the required space.  this is done for
 1451          * two reasons.  first, it makes no sense searching the dmap control
 1452          * pages for free space when we know that free space exists.  second,
 1453          * the dmap control pages may indicate that the allocation group
 1454          * has no free space if the allocation group is part (not the first
 1455          * part) of a larger binary buddy system.
 1456          */
 1457         if (bmp->db_agsize == BPERDMAP
 1458             || bmp->db_agfree[agno] == bmp->db_agsize) {
 1459                 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
 1460                 /* assert(!(rc == ENOSPC && bmp->db_agfree[agno] == bmp->db_agsize)); */
 1461                 if ((rc == ENOSPC) &&
 1462                     (bmp->db_agfree[agno] == bmp->db_agsize)) {
 1463                         jfs_err("dbAllocAG: removed assert, but still need to "
 1464                                 "debug here\nblkno = 0x%Lx, nblocks = 0x%Lx",
 1465                                 (unsigned long long) blkno,
 1466                                 (unsigned long long) nblocks);
 1467                 }
 1468                 return (rc);
 1469         }
 1470 
 1471         /* the buffer for the dmap control page that fully describes the
 1472          * allocation group.
 1473          */
 1474         lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
 1475         mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 1476         if (mp == NULL)
 1477                 return (EIO);
 1478         dcp = (struct dmapctl *) mp->data;
 1479         budmin = dcp->budmin;
 1480 
 1481         /* search the subtree(s) of the dmap control page that describes
 1482          * the allocation group, looking for sufficient free space.  to begin,
 1483          * determine how many allocation groups are represented in a dmap
 1484          * control page at the control page level (i.e. L0, L1, L2) that
 1485          * fully describes an allocation group. next, determine the starting
 1486          * tree index of this allocation group within the control page.
 1487          */
 1488         agperlev =
 1489             (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
 1490         ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
 1491 
 1492         /* dmap control page trees fan-out by 4 and a single allocation 
 1493          * group may be described by 1 or 2 subtrees within the ag level
 1494          * dmap control page, depending upon the ag size. examine the ag's
 1495          * subtrees for sufficient free space, starting with the leftmost
 1496          * subtree.
 1497          */
 1498         for (i = 0; i < bmp->db_agwidth; i++, ti++) {
 1499                 /* is there sufficient free space ?
 1500                  */
 1501                 if (l2nb > dcp->stree[ti])
 1502                         continue;
 1503 
 1504                 /* sufficient free space found in a subtree. now search down
 1505                  * the subtree to find the leftmost leaf that describes this
 1506                  * free space.
 1507                  */
 1508                 for (k = bmp->db_agheigth; k > 0; k--) {
 1509                         for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
 1510                                 if (l2nb <= dcp->stree[m + n]) {
 1511                                         ti = m + n;
 1512                                         break;
 1513                                 }
 1514                         }
 1515                         assert(n < 4);
 1516                 }
 1517 
 1518                 /* determine the block number within the file system
 1519                  * that corresponds to this leaf.
 1520                  */
 1521                 if (bmp->db_aglevel == 2)
 1522                         blkno = 0;
 1523                 else if (bmp->db_aglevel == 1)
 1524                         blkno &= ~(MAXL1SIZE - 1);
 1525                 else            /* bmp->db_aglevel == 0 */
 1526                         blkno &= ~(MAXL0SIZE - 1);
 1527 
 1528                 blkno +=
 1529                     ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
 1530 
 1531                 /* release the buffer in preparation for going down
 1532                  * the next level of dmap control pages.
 1533                  */
 1534                 release_metapage(mp);
 1535 
 1536                 /* check if we need to continue to search down the lower
 1537                  * level dmap control pages.  we need to if the number of
 1538                  * blocks required is less than maximum number of blocks
 1539                  * described at the next lower level.
 1540                  */
 1541                 if (l2nb < budmin) {
 1542 
 1543                         /* search the lower level dmap control pages to get
 1544                          * the starting block number of the the dmap that
 1545                          * contains or starts off the free space.
 1546                          */
 1547                         if ((rc =
 1548                              dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
 1549                                        &blkno))) {
 1550                                 assert(rc != ENOSPC);
 1551                                 return (rc);
 1552                         }
 1553                 }
 1554 
 1555                 /* allocate the blocks.
 1556                  */
 1557                 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
 1558                 assert(rc != ENOSPC);
 1559                 return (rc);
 1560         }
 1561 
 1562         /* no space in the allocation group.  release the buffer and
 1563          * return ENOSPC.
 1564          */
 1565         release_metapage(mp);
 1566 
 1567         return (ENOSPC);
 1568 }
 1569 
 1570 
 1571 /*
 1572  * NAME:        dbAllocAny()
 1573  *
 1574  * FUNCTION:    attempt to allocate the specified number of contiguous
 1575  *              free blocks anywhere in the file system.
 1576  *
 1577  *              dbAllocAny() attempts to find the sufficient free space by
 1578  *              searching down the dmap control pages, starting with the
 1579  *              highest level (i.e. L0, L1, L2) control page.  if free space
 1580  *              large enough to satisfy the desired free space is found, the
 1581  *              desired free space is allocated.
 1582  *
 1583  * PARAMETERS:
 1584  *      bmp     -  pointer to bmap descriptor
 1585  *      nblocks  -  actual number of contiguous free blocks desired.
 1586  *      l2nb     -  log2 number of contiguous free blocks desired.
 1587  *      results -  on successful return, set to the starting block number
 1588  *                 of the newly allocated range.
 1589  *
 1590  * RETURN VALUES:
 1591  *      0       - success
 1592  *      ENOSPC  - insufficient disk resources
 1593  *      EIO     - i/o error
 1594  *
 1595  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
 1596  */
 1597 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
 1598 {
 1599         int rc;
 1600         s64 blkno = 0;
 1601 
 1602         /* starting with the top level dmap control page, search
 1603          * down the dmap control levels for sufficient free space.
 1604          * if free space is found, dbFindCtl() returns the starting
 1605          * block number of the dmap that contains or starts off the
 1606          * range of free space.
 1607          */
 1608         if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
 1609                 return (rc);
 1610 
 1611         /* allocate the blocks.
 1612          */
 1613         rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
 1614         assert(rc != ENOSPC);
 1615         return (rc);
 1616 }
 1617 
 1618 
 1619 /*
 1620  * NAME:        dbFindCtl()
 1621  *
 1622  * FUNCTION:    starting at a specified dmap control page level and block
 1623  *              number, search down the dmap control levels for a range of
 1624  *              contiguous free blocks large enough to satisfy an allocation
 1625  *              request for the specified number of free blocks.
 1626  *
 1627  *              if sufficient contiguous free blocks are found, this routine
 1628  *              returns the starting block number within a dmap page that
 1629  *              contains or starts a range of contiqious free blocks that
 1630  *              is sufficient in size.
 1631  *
 1632  * PARAMETERS:
 1633  *      bmp     -  pointer to bmap descriptor
 1634  *      level   -  starting dmap control page level.
 1635  *      l2nb    -  log2 number of contiguous free blocks desired.
 1636  *      *blkno  -  on entry, starting block number for conducting the search.
 1637  *                 on successful return, the first block within a dmap page
 1638  *                 that contains or starts a range of contiguous free blocks.
 1639  *
 1640  * RETURN VALUES:
 1641  *      0       - success
 1642  *      ENOSPC  - insufficient disk resources
 1643  *      EIO     - i/o error
 1644  *
 1645  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
 1646  */
 1647 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
 1648 {
 1649         int rc, leafidx, lev;
 1650         s64 b, lblkno;
 1651         struct dmapctl *dcp;
 1652         int budmin;
 1653         struct metapage *mp;
 1654 
 1655         /* starting at the specified dmap control page level and block
 1656          * number, search down the dmap control levels for the starting
 1657          * block number of a dmap page that contains or starts off 
 1658          * sufficient free blocks.
 1659          */
 1660         for (lev = level, b = *blkno; lev >= 0; lev--) {
 1661                 /* get the buffer of the dmap control page for the block
 1662                  * number and level (i.e. L0, L1, L2).
 1663                  */
 1664                 lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
 1665                 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 1666                 if (mp == NULL)
 1667                         return (EIO);
 1668                 dcp = (struct dmapctl *) mp->data;
 1669                 budmin = dcp->budmin;
 1670 
 1671                 /* search the tree within the dmap control page for
 1672                  * sufficent free space.  if sufficient free space is found,
 1673                  * dbFindLeaf() returns the index of the leaf at which
 1674                  * free space was found.
 1675                  */
 1676                 rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
 1677 
 1678                 /* release the buffer.
 1679                  */
 1680                 release_metapage(mp);
 1681 
 1682                 /* space found ?
 1683                  */
 1684                 if (rc) {
 1685                         assert(lev == level);
 1686                         return (ENOSPC);
 1687                 }
 1688 
 1689                 /* adjust the block number to reflect the location within
 1690                  * the dmap control page (i.e. the leaf) at which free 
 1691                  * space was found.
 1692                  */
 1693                 b += (((s64) leafidx) << budmin);
 1694 
 1695                 /* we stop the search at this dmap control page level if
 1696                  * the number of blocks required is greater than or equal
 1697                  * to the maximum number of blocks described at the next
 1698                  * (lower) level.
 1699                  */
 1700                 if (l2nb >= budmin)
 1701                         break;
 1702         }
 1703 
 1704         *blkno = b;
 1705         return (0);
 1706 }
 1707 
 1708 
 1709 /*
 1710  * NAME:        dbAllocCtl()
 1711  *
 1712  * FUNCTION:    attempt to allocate a specified number of contiguous
 1713  *              blocks starting within a specific dmap.  
 1714  *              
 1715  *              this routine is called by higher level routines that search
 1716  *              the dmap control pages above the actual dmaps for contiguous
 1717  *              free space.  the result of successful searches by these
 1718  *              routines are the starting block numbers within dmaps, with
 1719  *              the dmaps themselves containing the desired contiguous free
 1720  *              space or starting a contiguous free space of desired size
 1721  *              that is made up of the blocks of one or more dmaps. these
 1722  *              calls should not fail due to insufficent resources.
 1723  *
 1724  *              this routine is called in some cases where it is not known
 1725  *              whether it will fail due to insufficient resources.  more
 1726  *              specifically, this occurs when allocating from an allocation
 1727  *              group whose size is equal to the number of blocks per dmap.
 1728  *              in this case, the dmap control pages are not examined prior
 1729  *              to calling this routine (to save pathlength) and the call
 1730  *              might fail.
 1731  *
 1732  *              for a request size that fits within a dmap, this routine relies
 1733  *              upon the dmap's dmtree to find the requested contiguous free
 1734  *              space.  for request sizes that are larger than a dmap, the
 1735  *              requested free space will start at the first block of the
 1736  *              first dmap (i.e. blkno).
 1737  *
 1738  * PARAMETERS:
 1739  *      bmp     -  pointer to bmap descriptor
 1740  *      nblocks  -  actual number of contiguous free blocks to allocate.
 1741  *      l2nb     -  log2 number of contiguous free blocks to allocate.
 1742  *      blkno    -  starting block number of the dmap to start the allocation
 1743  *                  from.
 1744  *      results -  on successful return, set to the starting block number
 1745  *                 of the newly allocated range.
 1746  *
 1747  * RETURN VALUES:
 1748  *      0       - success
 1749  *      ENOSPC  - insufficient disk resources
 1750  *      EIO     - i/o error
 1751  *
 1752  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
 1753  */
 1754 static int
 1755 dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
 1756 {
 1757         int rc, nb;
 1758         s64 b, lblkno, n;
 1759         struct metapage *mp;
 1760         struct dmap *dp;
 1761 
 1762         /* check if the allocation request is confined to a single dmap.
 1763          */
 1764         if (l2nb <= L2BPERDMAP) {
 1765                 /* get the buffer for the dmap.
 1766                  */
 1767                 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
 1768                 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 1769                 if (mp == NULL)
 1770                         return (EIO);
 1771                 dp = (struct dmap *) mp->data;
 1772 
 1773                 /* try to allocate the blocks.
 1774                  */
 1775                 rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
 1776                 if (rc == 0)
 1777                         mark_metapage_dirty(mp);
 1778 
 1779                 release_metapage(mp);
 1780 
 1781                 return (rc);
 1782         }
 1783 
 1784         /* allocation request involving multiple dmaps. it must start on
 1785          * a dmap boundary.
 1786          */
 1787         assert((blkno & (BPERDMAP - 1)) == 0);
 1788 
 1789         /* allocate the blocks dmap by dmap.
 1790          */
 1791         for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
 1792                 /* get the buffer for the dmap.
 1793                  */
 1794                 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
 1795                 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 1796                 if (mp == NULL) {
 1797                         rc = EIO;
 1798                         goto backout;
 1799                 }
 1800                 dp = (struct dmap *) mp->data;
 1801 
 1802                 /* the dmap better be all free.
 1803                  */
 1804                 assert(dp->tree.stree[ROOT] == L2BPERDMAP);
 1805 
 1806                 /* determine how many blocks to allocate from this dmap.
 1807                  */
 1808                 nb = min(n, (s64)BPERDMAP);
 1809 
 1810                 /* allocate the blocks from the dmap.
 1811                  */
 1812                 if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
 1813                         release_metapage(mp);
 1814                         goto backout;
 1815                 }
 1816 
 1817                 /* write the buffer.
 1818                  */
 1819                 write_metapage(mp);
 1820         }
 1821 
 1822         /* set the results (starting block number) and return.
 1823          */
 1824         *results = blkno;
 1825         return (0);
 1826 
 1827         /* something failed in handling an allocation request involving
 1828          * multiple dmaps.  we'll try to clean up by backing out any
 1829          * allocation that has already happened for this request.  if
 1830          * we fail in backing out the allocation, we'll mark the file
 1831          * system to indicate that blocks have been leaked.
 1832          */
 1833       backout:
 1834 
 1835         /* try to backout the allocations dmap by dmap.
 1836          */
 1837         for (n = nblocks - n, b = blkno; n > 0;
 1838              n -= BPERDMAP, b += BPERDMAP) {
 1839                 /* get the buffer for this dmap.
 1840                  */
 1841                 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
 1842                 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 1843                 if (mp == NULL) {
 1844                         /* could not back out.  mark the file system
 1845                          * to indicate that we have leaked blocks.
 1846                          */
 1847                         fsDirty();      /* !!! */
 1848                         jfs_err("dbAllocCtl: I/O Error: Block Leakage.");
 1849                         continue;
 1850                 }
 1851                 dp = (struct dmap *) mp->data;
 1852 
 1853                 /* free the blocks is this dmap.
 1854                  */
 1855                 if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
 1856                         /* could not back out.  mark the file system
 1857                          * to indicate that we have leaked blocks.
 1858                          */
 1859                         release_metapage(mp);
 1860                         fsDirty();      /* !!! */
 1861                         jfs_err("dbAllocCtl: Block Leakage.");
 1862                         continue;
 1863                 }
 1864 
 1865                 /* write the buffer.
 1866                  */
 1867                 write_metapage(mp);
 1868         }
 1869 
 1870         return (rc);
 1871 }
 1872 
 1873 
 1874 /*
 1875  * NAME:        dbAllocDmapLev()
 1876  *
 1877  * FUNCTION:    attempt to allocate a specified number of contiguous blocks
 1878  *              from a specified dmap.
 1879  *              
 1880  *              this routine checks if the contiguous blocks are available.
 1881  *              if so, nblocks of blocks are allocated; otherwise, ENOSPC is
 1882  *              returned.
 1883  *
 1884  * PARAMETERS:
 1885  *      mp      -  pointer to bmap descriptor
 1886  *      dp      -  pointer to dmap to attempt to allocate blocks from. 
 1887  *      l2nb    -  log2 number of contiguous block desired.
 1888  *      nblocks -  actual number of contiguous block desired.
 1889  *      results -  on successful return, set to the starting block number
 1890  *                 of the newly allocated range.
 1891  *
 1892  * RETURN VALUES:
 1893  *      0       - success
 1894  *      ENOSPC  - insufficient disk resources
 1895  *      EIO     - i/o error
 1896  *
 1897  * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 
 1898  *      IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
 1899  */
 1900 static int
 1901 dbAllocDmapLev(struct bmap * bmp,
 1902                struct dmap * dp, int nblocks, int l2nb, s64 * results)
 1903 {
 1904         s64 blkno;
 1905         int leafidx, rc;
 1906 
 1907         /* can't be more than a dmaps worth of blocks */
 1908         assert(l2nb <= L2BPERDMAP);
 1909 
 1910         /* search the tree within the dmap page for sufficient
 1911          * free space.  if sufficient free space is found, dbFindLeaf()
 1912          * returns the index of the leaf at which free space was found.
 1913          */
 1914         if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
 1915                 return (ENOSPC);
 1916 
 1917         /* determine the block number within the file system corresponding
 1918          * to the leaf at which free space was found.
 1919          */
 1920         blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
 1921 
 1922         /* if not all bits of the dmap word are free, get the starting
 1923          * bit number within the dmap word of the required string of free
 1924          * bits and adjust the block number with this value.
 1925          */
 1926         if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
 1927                 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
 1928 
 1929         /* allocate the blocks */
 1930         if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
 1931                 *results = blkno;
 1932 
 1933         return (rc);
 1934 }
 1935 
 1936 
 1937 /*
 1938  * NAME:        dbAllocDmap()
 1939  *
 1940  * FUNCTION:    adjust the disk allocation map to reflect the allocation
 1941  *              of a specified block range within a dmap.
 1942  *
 1943  *              this routine allocates the specified blocks from the dmap
 1944  *              through a call to dbAllocBits(). if the allocation of the
 1945  *              block range causes the maximum string of free blocks within
 1946  *              the dmap to change (i.e. the value of the root of the dmap's
 1947  *              dmtree), this routine will cause this change to be reflected
 1948  *              up through the appropriate levels of the dmap control pages
 1949  *              by a call to dbAdjCtl() for the L0 dmap control page that
 1950  *              covers this dmap.
 1951  *
 1952  * PARAMETERS:
 1953  *      bmp     -  pointer to bmap descriptor
 1954  *      dp      -  pointer to dmap to allocate the block range from.
 1955  *      blkno   -  starting block number of the block to be allocated.
 1956  *      nblocks -  number of blocks to be allocated.
 1957  *
 1958  * RETURN VALUES:
 1959  *      0       - success
 1960  *      EIO     - i/o error
 1961  *
 1962  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 1963  */
 1964 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
 1965                        int nblocks)
 1966 {
 1967         s8 oldroot;
 1968         int rc;
 1969 
 1970         /* save the current value of the root (i.e. maximum free string)
 1971          * of the dmap tree.
 1972          */
 1973         oldroot = dp->tree.stree[ROOT];
 1974 
 1975         /* allocate the specified (blocks) bits */
 1976         dbAllocBits(bmp, dp, blkno, nblocks);
 1977 
 1978         /* if the root has not changed, done. */
 1979         if (dp->tree.stree[ROOT] == oldroot)
 1980                 return (0);
 1981 
 1982         /* root changed. bubble the change up to the dmap control pages.
 1983          * if the adjustment of the upper level control pages fails,
 1984          * backout the bit allocation (thus making everything consistent).
 1985          */
 1986         if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
 1987                 dbFreeBits(bmp, dp, blkno, nblocks);
 1988 
 1989         return (rc);
 1990 }
 1991 
 1992 
 1993 /*
 1994  * NAME:        dbFreeDmap()
 1995  *
 1996  * FUNCTION:    adjust the disk allocation map to reflect the allocation
 1997  *              of a specified block range within a dmap.
 1998  *
 1999  *              this routine frees the specified blocks from the dmap through
 2000  *              a call to dbFreeBits(). if the deallocation of the block range
 2001  *              causes the maximum string of free blocks within the dmap to
 2002  *              change (i.e. the value of the root of the dmap's dmtree), this
 2003  *              routine will cause this change to be reflected up through the
 2004  *              appropriate levels of the dmap control pages by a call to
 2005  *              dbAdjCtl() for the L0 dmap control page that covers this dmap.
 2006  *
 2007  * PARAMETERS:
 2008  *      bmp     -  pointer to bmap descriptor
 2009  *      dp      -  pointer to dmap to free the block range from.
 2010  *      blkno   -  starting block number of the block to be freed.
 2011  *      nblocks -  number of blocks to be freed.
 2012  *
 2013  * RETURN VALUES:
 2014  *      0       - success
 2015  *      EIO     - i/o error
 2016  *
 2017  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2018  */
 2019 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
 2020                       int nblocks)
 2021 {
 2022         s8 oldroot;
 2023         int rc, word;
 2024 
 2025         /* save the current value of the root (i.e. maximum free string)
 2026          * of the dmap tree.
 2027          */
 2028         oldroot = dp->tree.stree[ROOT];
 2029 
 2030         /* free the specified (blocks) bits */
 2031         dbFreeBits(bmp, dp, blkno, nblocks);
 2032 
 2033         /* if the root has not changed, done. */
 2034         if (dp->tree.stree[ROOT] == oldroot)
 2035                 return (0);
 2036 
 2037         /* root changed. bubble the change up to the dmap control pages.
 2038          * if the adjustment of the upper level control pages fails,
 2039          * backout the deallocation. 
 2040          */
 2041         if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
 2042                 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
 2043 
 2044                 /* as part of backing out the deallocation, we will have
 2045                  * to back split the dmap tree if the deallocation caused
 2046                  * the freed blocks to become part of a larger binary buddy
 2047                  * system.
 2048                  */
 2049                 if (dp->tree.stree[word] == NOFREE)
 2050                         dbBackSplit((dmtree_t *) & dp->tree, word);
 2051 
 2052                 dbAllocBits(bmp, dp, blkno, nblocks);
 2053         }
 2054 
 2055         return (rc);
 2056 }
 2057 
 2058 
 2059 /*
 2060  * NAME:        dbAllocBits()
 2061  *
 2062  * FUNCTION:    allocate a specified block range from a dmap.
 2063  *
 2064  *              this routine updates the dmap to reflect the working
 2065  *              state allocation of the specified block range. it directly
 2066  *              updates the bits of the working map and causes the adjustment
 2067  *              of the binary buddy system described by the dmap's dmtree
 2068  *              leaves to reflect the bits allocated.  it also causes the
 2069  *              dmap's dmtree, as a whole, to reflect the allocated range.
 2070  *
 2071  * PARAMETERS:
 2072  *      bmp     -  pointer to bmap descriptor
 2073  *      dp      -  pointer to dmap to allocate bits from.
 2074  *      blkno   -  starting block number of the bits to be allocated.
 2075  *      nblocks -  number of bits to be allocated.
 2076  *
 2077  * RETURN VALUES: none
 2078  *
 2079  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2080  */
 2081 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 2082                         int nblocks)
 2083 {
 2084         int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
 2085         dmtree_t *tp = (dmtree_t *) & dp->tree;
 2086         int size;
 2087         s8 *leaf;
 2088 
 2089         /* pick up a pointer to the leaves of the dmap tree */
 2090         leaf = dp->tree.stree + LEAFIND;
 2091 
 2092         /* determine the bit number and word within the dmap of the
 2093          * starting block.
 2094          */
 2095         dbitno = blkno & (BPERDMAP - 1);
 2096         word = dbitno >> L2DBWORD;
 2097 
 2098         /* block range better be within the dmap */
 2099         assert(dbitno + nblocks <= BPERDMAP);
 2100 
 2101         /* allocate the bits of the dmap's words corresponding to the block
 2102          * range. not all bits of the first and last words may be contained
 2103          * within the block range.  if this is the case, we'll work against
 2104          * those words (i.e. partial first and/or last) on an individual basis
 2105          * (a single pass), allocating the bits of interest by hand and
 2106          * updating the leaf corresponding to the dmap word. a single pass
 2107          * will be used for all dmap words fully contained within the
 2108          * specified range.  within this pass, the bits of all fully contained
 2109          * dmap words will be marked as free in a single shot and the leaves
 2110          * will be updated. a single leaf may describe the free space of
 2111          * multiple dmap words, so we may update only a subset of the actual
 2112          * leaves corresponding to the dmap words of the block range.
 2113          */
 2114         for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
 2115                 /* determine the bit number within the word and
 2116                  * the number of bits within the word.
 2117                  */
 2118                 wbitno = dbitno & (DBWORD - 1);
 2119                 nb = min(rembits, DBWORD - wbitno);
 2120 
 2121                 /* check if only part of a word is to be allocated.
 2122                  */
 2123                 if (nb < DBWORD) {
 2124                         /* allocate (set to 1) the appropriate bits within
 2125                          * this dmap word.
 2126                          */
 2127                         dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
 2128                                                       >> wbitno);
 2129 
 2130                         /* update the leaf for this dmap word. in addition
 2131                          * to setting the leaf value to the binary buddy max
 2132                          * of the updated dmap word, dbSplit() will split
 2133                          * the binary system of the leaves if need be.
 2134                          */
 2135                         dbSplit(tp, word, BUDMIN,
 2136                                 dbMaxBud((u8 *) & dp->wmap[word]));
 2137 
 2138                         word += 1;
 2139                 } else {
 2140                         /* one or more dmap words are fully contained
 2141                          * within the block range.  determine how many
 2142                          * words and allocate (set to 1) the bits of these
 2143                          * words.
 2144                          */
 2145                         nwords = rembits >> L2DBWORD;
 2146                         memset(&dp->wmap[word], (int) ONES, nwords * 4);
 2147 
 2148                         /* determine how many bits.
 2149                          */
 2150                         nb = nwords << L2DBWORD;
 2151 
 2152                         /* now update the appropriate leaves to reflect
 2153                          * the allocated words.
 2154                          */
 2155                         for (; nwords > 0; nwords -= nw) {
 2156                                 assert(leaf[word] >= BUDMIN);
 2157 
 2158                                 /* determine what the leaf value should be
 2159                                  * updated to as the minimum of the l2 number
 2160                                  * of bits being allocated and the l2 number
 2161                                  * of bits currently described by this leaf.
 2162                                  */
 2163                                 size = min((int)leaf[word], NLSTOL2BSZ(nwords));
 2164 
 2165                                 /* update the leaf to reflect the allocation.
 2166                                  * in addition to setting the leaf value to
 2167                                  * NOFREE, dbSplit() will split the binary
 2168                                  * system of the leaves to reflect the current
 2169                                  * allocation (size).
 2170                                  */
 2171                                 dbSplit(tp, word, size, NOFREE);
 2172 
 2173                                 /* get the number of dmap words handled */
 2174                                 nw = BUDSIZE(size, BUDMIN);
 2175                                 word += nw;
 2176                         }
 2177                 }
 2178         }
 2179 
 2180         /* update the free count for this dmap */
 2181         dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
 2182 
 2183         BMAP_LOCK(bmp);
 2184 
 2185         /* if this allocation group is completely free,
 2186          * update the maximum allocation group number if this allocation
 2187          * group is the new max.
 2188          */
 2189         agno = blkno >> bmp->db_agl2size;
 2190         if (agno > bmp->db_maxag)
 2191                 bmp->db_maxag = agno;
 2192 
 2193         /* update the free count for the allocation group and map */
 2194         bmp->db_agfree[agno] -= nblocks;
 2195         bmp->db_nfree -= nblocks;
 2196 
 2197         BMAP_UNLOCK(bmp);
 2198 }
 2199 
 2200 
 2201 /*
 2202  * NAME:        dbFreeBits()
 2203  *
 2204  * FUNCTION:    free a specified block range from a dmap.
 2205  *
 2206  *              this routine updates the dmap to reflect the working
 2207  *              state allocation of the specified block range. it directly
 2208  *              updates the bits of the working map and causes the adjustment
 2209  *              of the binary buddy system described by the dmap's dmtree
 2210  *              leaves to reflect the bits freed.  it also causes the dmap's
 2211  *              dmtree, as a whole, to reflect the deallocated range.
 2212  *
 2213  * PARAMETERS:
 2214  *      bmp     -  pointer to bmap descriptor
 2215  *      dp      -  pointer to dmap to free bits from.
 2216  *      blkno   -  starting block number of the bits to be freed.
 2217  *      nblocks -  number of bits to be freed.
 2218  *
 2219  * RETURN VALUES: none
 2220  *
 2221  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2222  */
 2223 static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 2224                        int nblocks)
 2225 {
 2226         int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
 2227         dmtree_t *tp = (dmtree_t *) & dp->tree;
 2228         int size;
 2229 
 2230         /* determine the bit number and word within the dmap of the
 2231          * starting block.
 2232          */
 2233         dbitno = blkno & (BPERDMAP - 1);
 2234         word = dbitno >> L2DBWORD;
 2235 
 2236         /* block range better be within the dmap.
 2237          */
 2238         assert(dbitno + nblocks <= BPERDMAP);
 2239 
 2240         /* free the bits of the dmaps words corresponding to the block range.
 2241          * not all bits of the first and last words may be contained within
 2242          * the block range.  if this is the case, we'll work against those
 2243          * words (i.e. partial first and/or last) on an individual basis
 2244          * (a single pass), freeing the bits of interest by hand and updating
 2245          * the leaf corresponding to the dmap word. a single pass will be used
 2246          * for all dmap words fully contained within the specified range.  
 2247          * within this pass, the bits of all fully contained dmap words will
 2248          * be marked as free in a single shot and the leaves will be updated. a
 2249          * single leaf may describe the free space of multiple dmap words,
 2250          * so we may update only a subset of the actual leaves corresponding
 2251          * to the dmap words of the block range.
 2252          *
 2253          * dbJoin() is used to update leaf values and will join the binary
 2254          * buddy system of the leaves if the new leaf values indicate this
 2255          * should be done.
 2256          */
 2257         for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
 2258                 /* determine the bit number within the word and
 2259                  * the number of bits within the word.
 2260                  */
 2261                 wbitno = dbitno & (DBWORD - 1);
 2262                 nb = min(rembits, DBWORD - wbitno);
 2263 
 2264                 /* check if only part of a word is to be freed.
 2265                  */
 2266                 if (nb < DBWORD) {
 2267                         /* free (zero) the appropriate bits within this
 2268                          * dmap word. 
 2269                          */
 2270                         dp->wmap[word] &=
 2271                             cpu_to_le32(~(ONES << (DBWORD - nb)
 2272                                           >> wbitno));
 2273 
 2274                         /* update the leaf for this dmap word.
 2275                          */
 2276                         dbJoin(tp, word,
 2277                                dbMaxBud((u8 *) & dp->wmap[word]));
 2278 
 2279                         word += 1;
 2280                 } else {
 2281                         /* one or more dmap words are fully contained
 2282                          * within the block range.  determine how many
 2283                          * words and free (zero) the bits of these words.
 2284                          */
 2285                         nwords = rembits >> L2DBWORD;
 2286                         memset(&dp->wmap[word], 0, nwords * 4);
 2287 
 2288                         /* determine how many bits.
 2289                          */
 2290                         nb = nwords << L2DBWORD;
 2291 
 2292                         /* now update the appropriate leaves to reflect
 2293                          * the freed words.
 2294                          */
 2295                         for (; nwords > 0; nwords -= nw) {
 2296                                 /* determine what the leaf value should be
 2297                                  * updated to as the minimum of the l2 number
 2298                                  * of bits being freed and the l2 (max) number
 2299                                  * of bits that can be described by this leaf.
 2300                                  */
 2301                                 size =
 2302                                     min(LITOL2BSZ
 2303                                         (word, L2LPERDMAP, BUDMIN),
 2304                                         NLSTOL2BSZ(nwords));
 2305 
 2306                                 /* update the leaf.
 2307                                  */
 2308                                 dbJoin(tp, word, size);
 2309 
 2310                                 /* get the number of dmap words handled.
 2311                                  */
 2312                                 nw = BUDSIZE(size, BUDMIN);
 2313                                 word += nw;
 2314                         }
 2315                 }
 2316         }
 2317 
 2318         /* update the free count for this dmap.
 2319          */
 2320         dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
 2321 
 2322         BMAP_LOCK(bmp);
 2323 
 2324         /* update the free count for the allocation group and 
 2325          * map.
 2326          */
 2327         agno = blkno >> bmp->db_agl2size;
 2328         bmp->db_nfree += nblocks;
 2329         bmp->db_agfree[agno] += nblocks;
 2330 
 2331         /* check if this allocation group is not completely free and
 2332          * if it is currently the maximum (rightmost) allocation group.
 2333          * if so, establish the new maximum allocation group number by
 2334          * searching left for the first allocation group with allocation.
 2335          */
 2336         if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
 2337             (agno == bmp->db_numag - 1 &&
 2338              bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
 2339                 while (bmp->db_maxag > 0) {
 2340                         bmp->db_maxag -= 1;
 2341                         if (bmp->db_agfree[bmp->db_maxag] !=
 2342                             bmp->db_agsize)
 2343                                 break;
 2344                 }
 2345 
 2346                 /* re-establish the allocation group preference if the
 2347                  * current preference is right of the maximum allocation
 2348                  * group.
 2349                  */
 2350                 if (bmp->db_agpref > bmp->db_maxag)
 2351                         bmp->db_agpref = bmp->db_maxag;
 2352         }
 2353 
 2354         BMAP_UNLOCK(bmp);
 2355 }
 2356 
 2357 
 2358 /*
 2359  * NAME:        dbAdjCtl()
 2360  *
 2361  * FUNCTION:    adjust a dmap control page at a specified level to reflect
 2362  *              the change in a lower level dmap or dmap control page's
 2363  *              maximum string of free blocks (i.e. a change in the root
 2364  *              of the lower level object's dmtree) due to the allocation
 2365  *              or deallocation of a range of blocks with a single dmap.
 2366  *
 2367  *              on entry, this routine is provided with the new value of
 2368  *              the lower level dmap or dmap control page root and the
 2369  *              starting block number of the block range whose allocation
 2370  *              or deallocation resulted in the root change.  this range
 2371  *              is respresented by a single leaf of the current dmapctl
 2372  *              and the leaf will be updated with this value, possibly
 2373  *              causing a binary buddy system within the leaves to be 
 2374  *              split or joined.  the update may also cause the dmapctl's
 2375  *              dmtree to be updated.
 2376  *
 2377  *              if the adjustment of the dmap control page, itself, causes its
 2378  *              root to change, this change will be bubbled up to the next dmap
 2379  *              control level by a recursive call to this routine, specifying
 2380  *              the new root value and the next dmap control page level to
 2381  *              be adjusted.
 2382  * PARAMETERS:
 2383  *      bmp     -  pointer to bmap descriptor
 2384  *      blkno   -  the first block of a block range within a dmap.  it is
 2385  *                 the allocation or deallocation of this block range that
 2386  *                 requires the dmap control page to be adjusted.
 2387  *      newval  -  the new value of the lower level dmap or dmap control
 2388  *                 page root.
 2389  *      alloc   -  TRUE if adjustment is due to an allocation.
 2390  *      level   -  current level of dmap control page (i.e. L0, L1, L2) to
 2391  *                 be adjusted.
 2392  *
 2393  * RETURN VALUES:
 2394  *      0       - success
 2395  *      EIO     - i/o error
 2396  *
 2397  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2398  */
 2399 static int
 2400 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
 2401 {
 2402         struct metapage *mp;
 2403         s8 oldroot;
 2404         int oldval;
 2405         s64 lblkno;
 2406         struct dmapctl *dcp;
 2407         int rc, leafno, ti;
 2408 
 2409         /* get the buffer for the dmap control page for the specified
 2410          * block number and control page level.
 2411          */
 2412         lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
 2413         mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
 2414         if (mp == NULL)
 2415                 return (EIO);
 2416         dcp = (struct dmapctl *) mp->data;
 2417 
 2418         /* determine the leaf number corresponding to the block and
 2419          * the index within the dmap control tree.
 2420          */
 2421         leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
 2422         ti = leafno + le32_to_cpu(dcp->leafidx);
 2423 
 2424         /* save the current leaf value and the current root level (i.e.
 2425          * maximum l2 free string described by this dmapctl).
 2426          */
 2427         oldval = dcp->stree[ti];
 2428         oldroot = dcp->stree[ROOT];
 2429 
 2430         /* check if this is a control page update for an allocation.
 2431          * if so, update the leaf to reflect the new leaf value using
 2432          * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
 2433          * the leaf with the new value.  in addition to updating the
 2434          * leaf, dbSplit() will also split the binary buddy system of
 2435          * the leaves, if required, and bubble new values within the
 2436          * dmapctl tree, if required.  similarly, dbJoin() will join
 2437          * the binary buddy system of leaves and bubble new values up
 2438          * the dmapctl tree as required by the new leaf value.
 2439          */
 2440         if (alloc) {
 2441                 /* check if we are in the middle of a binary buddy
 2442                  * system.  this happens when we are performing the
 2443                  * first allocation out of an allocation group that
 2444                  * is part (not the first part) of a larger binary
 2445                  * buddy system.  if we are in the middle, back split
 2446                  * the system prior to calling dbSplit() which assumes
 2447                  * that it is at the front of a binary buddy system.
 2448                  */
 2449                 if (oldval == NOFREE) {
 2450                         dbBackSplit((dmtree_t *) dcp, leafno);
 2451                         oldval = dcp->stree[ti];
 2452                 }
 2453                 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
 2454         } else {
 2455                 dbJoin((dmtree_t *) dcp, leafno, newval);
 2456         }
 2457 
 2458         /* check if the root of the current dmap control page changed due
 2459          * to the update and if the current dmap control page is not at
 2460          * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
 2461          * root changed and this is not the top level), call this routine
 2462          * again (recursion) for the next higher level of the mapping to
 2463          * reflect the change in root for the current dmap control page.
 2464          */
 2465         if (dcp->stree[ROOT] != oldroot) {
 2466                 /* are we below the top level of the map.  if so,
 2467                  * bubble the root up to the next higher level.
 2468                  */
 2469                 if (level < bmp->db_maxlevel) {
 2470                         /* bubble up the new root of this dmap control page to
 2471                          * the next level.
 2472                          */
 2473                         if ((rc =
 2474                              dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
 2475                                       level + 1))) {
 2476                                 /* something went wrong in bubbling up the new
 2477                                  * root value, so backout the changes to the
 2478                                  * current dmap control page.
 2479                                  */
 2480                                 if (alloc) {
 2481                                         dbJoin((dmtree_t *) dcp, leafno,
 2482                                                oldval);
 2483                                 } else {
 2484                                         /* the dbJoin() above might have
 2485                                          * caused a larger binary buddy system
 2486                                          * to form and we may now be in the
 2487                                          * middle of it.  if this is the case,
 2488                                          * back split the buddies.
 2489                                          */
 2490                                         if (dcp->stree[ti] == NOFREE)
 2491                                                 dbBackSplit((dmtree_t *)
 2492                                                             dcp, leafno);
 2493                                         dbSplit((dmtree_t *) dcp, leafno,
 2494                                                 dcp->budmin, oldval);
 2495                                 }
 2496 
 2497                                 /* release the buffer and return the error.
 2498                                  */
 2499                                 release_metapage(mp);
 2500                                 return (rc);
 2501                         }
 2502                 } else {
 2503                         /* we're at the top level of the map. update
 2504                          * the bmap control page to reflect the size
 2505                          * of the maximum free buddy system.
 2506                          */
 2507                         assert(level == bmp->db_maxlevel);
 2508                         assert(bmp->db_maxfreebud == oldroot);
 2509                         bmp->db_maxfreebud = dcp->stree[ROOT];
 2510                 }
 2511         }
 2512 
 2513         /* write the buffer.
 2514          */
 2515         write_metapage(mp);
 2516 
 2517         return (0);
 2518 }
 2519 
 2520 
 2521 /*
 2522  * NAME:        dbSplit()
 2523  *
 2524  * FUNCTION:    update the leaf of a dmtree with a new value, splitting
 2525  *              the leaf from the binary buddy system of the dmtree's
 2526  *              leaves, as required.
 2527  *
 2528  * PARAMETERS:
 2529  *      tp      - pointer to the tree containing the leaf.
 2530  *      leafno  - the number of the leaf to be updated.
 2531  *      splitsz - the size the binary buddy system starting at the leaf
 2532  *                must be split to, specified as the log2 number of blocks.
 2533  *      newval  - the new value for the leaf.
 2534  *
 2535  * RETURN VALUES: none
 2536  *
 2537  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2538  */
 2539 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
 2540 {
 2541         int budsz;
 2542         int cursz;
 2543         s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
 2544 
 2545         /* check if the leaf needs to be split.
 2546          */
 2547         if (leaf[leafno] > tp->dmt_budmin) {
 2548                 /* the split occurs by cutting the buddy system in half
 2549                  * at the specified leaf until we reach the specified
 2550                  * size.  pick up the starting split size (current size
 2551                  * - 1 in l2) and the corresponding buddy size.
 2552                  */
 2553                 cursz = leaf[leafno] - 1;
 2554                 budsz = BUDSIZE(cursz, tp->dmt_budmin);
 2555 
 2556                 /* split until we reach the specified size.
 2557                  */
 2558                 while (cursz >= splitsz) {
 2559                         /* update the buddy's leaf with its new value.
 2560                          */
 2561                         dbAdjTree(tp, leafno ^ budsz, cursz);
 2562 
 2563                         /* on to the next size and buddy.
 2564                          */
 2565                         cursz -= 1;
 2566                         budsz >>= 1;
 2567                 }
 2568         }
 2569 
 2570         /* adjust the dmap tree to reflect the specified leaf's new 
 2571          * value.
 2572          */
 2573         dbAdjTree(tp, leafno, newval);
 2574 }
 2575 
 2576 
 2577 /*
 2578  * NAME:        dbBackSplit()
 2579  *
 2580  * FUNCTION:    back split the binary buddy system of dmtree leaves
 2581  *              that hold a specified leaf until the specified leaf
 2582  *              starts its own binary buddy system.
 2583  *
 2584  *              the allocators typically perform allocations at the start
 2585  *              of binary buddy systems and dbSplit() is used to accomplish
 2586  *              any required splits.  in some cases, however, allocation
 2587  *              may occur in the middle of a binary system and requires a
 2588  *              back split, with the split proceeding out from the middle of
 2589  *              the system (less efficient) rather than the start of the
 2590  *              system (more efficient).  the cases in which a back split
 2591  *              is required are rare and are limited to the first allocation
 2592  *              within an allocation group which is a part (not first part)
 2593  *              of a larger binary buddy system and a few exception cases
 2594  *              in which a previous join operation must be backed out.
 2595  *
 2596  * PARAMETERS:
 2597  *      tp      - pointer to the tree containing the leaf.
 2598  *      leafno  - the number of the leaf to be updated.
 2599  *
 2600  * RETURN VALUES: none
 2601  *
 2602  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 2603  */
 2604 static void dbBackSplit(dmtree_t * tp, int leafno)
 2605 {
 2606         int budsz, bud, w, bsz, size;
 2607         int cursz;
 2608         s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
 2609 
 2610         /* leaf should be part (not first part) of a binary
 2611          * buddy system.
 2612          */
 2613         assert(leaf[leafno] == NOFREE);
 2614 
 2615         /* the back split is accomplished by iteratively finding the leaf
 2616          * that starts the buddy system that contains the specified leaf and
 2617          * splitting that system in two.  this iteration continues until
 2618          * the specified leaf becomes the start of a buddy system. 
 2619          *
 2620          * determine maximum possible l2 size for the specified leaf.
 2621          */
 2622         size =
 2623             LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
 2624                       tp->dmt_budmin);
 2625 
 2626         /* determine the number of leaves covered by this size.  this
 2627          * is the buddy size that we will start with as we search for
 2628          * the buddy system that contains the specified leaf.
 2629          */
 2630         budsz = BUDSIZE(size, tp->dmt_budmin);
 2631 
 2632         /* back split.
 2633          */
 2634         while (leaf[leafno] == NOFREE) {
 2635                 /* find the leftmost buddy leaf.
 2636                  */
 2637                 for (w = leafno, bsz = budsz;; bsz <<= 1,
 2638                      w = (w < bud) ? w : bud) {
 2639                         assert(bsz < le32_to_cpu(tp->dmt_nleafs));
 2640 
 2641                         /* determine the buddy.
 2642                          */
 2643                         bud = w ^ bsz;
 2644 
 2645                         /* check if this buddy is the start of the system.
 2646                          */
 2647                         if (leaf[bud] != NOFREE) {
 2648                                 /* split the leaf at the start of the
 2649                                  * system in two.
 2650                                  */
 2651                                 cursz = leaf[bud] - 1;
 2652                                 dbSplit(tp, bud, cursz, cursz);
 2653                                 break;
 2654                         }
 2655                 }
 2656         }
 2657 
 2658         assert(leaf[leafno] == size);
 2659 }
 2660 
 2661 
 2662 /*
 2663  * NAME:        dbJoin()
 2664  *
 2665  * FUNCTION:    update the leaf of a dmtree with a new value, joining
 2666  *              the leaf with other leaves of the dmtree into a multi-leaf
 2667  *              binary buddy system, as required.
 2668  *
 2669  * PARAMETERS:
 2670  *      tp      - pointer to the tree containing the leaf.
 2671  *      leafno  - the number of the leaf to be updated.
 2672  *      newval  - the new value for the leaf.
 2673  *
 2674  * RETURN VALUES: none
 2675  */
 2676 static void dbJoin(dmtree_t * tp, int leafno, int newval)
 2677 {
 2678         int budsz, buddy;
 2679         s8 *leaf;
 2680 
 2681         /* can the new leaf value require a join with other leaves ?
 2682          */
 2683         if (newval >= tp->dmt_budmin) {
 2684                 /* pickup a pointer to the leaves of the tree.
 2685                  */
 2686                 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
 2687 
 2688                 /* try to join the specified leaf into a large binary
 2689                  * buddy system.  the join proceeds by attempting to join
 2690                  * the specified leafno with its buddy (leaf) at new value.
 2691                  * if the join occurs, we attempt to join the left leaf
 2692                  * of the joined buddies with its buddy at new value + 1.
 2693                  * we continue to join until we find a buddy that cannot be
 2694                  * joined (does not have a value equal to the size of the
 2695                  * last join) or until all leaves have been joined into a
 2696                  * single system.
 2697                  *
 2698                  * get the buddy size (number of words covered) of
 2699                  * the new value.
 2700                  */
 2701                 budsz = BUDSIZE(newval, tp->dmt_budmin);
 2702 
 2703                 /* try to join.
 2704                  */
 2705                 while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
 2706                         /* get the buddy leaf.
 2707                          */
 2708                         buddy = leafno ^ budsz;
 2709 
 2710                         /* if the leaf's new value is greater than its
 2711                          * buddy's value, we join no more.
 2712                          */
 2713                         if (newval > leaf[buddy])
 2714                                 break;
 2715 
 2716                         assert(newval == leaf[buddy]);
 2717 
 2718                         /* check which (leafno or buddy) is the left buddy.
 2719                          * the left buddy gets to claim the blocks resulting
 2720                          * from the join while the right gets to claim none.
 2721                          * the left buddy is also eligable to participate in
 2722                          * a join at the next higher level while the right
 2723                          * is not.
 2724                          *
 2725                          */
 2726                         if (leafno < buddy) {
 2727                                 /* leafno is the left buddy.
 2728                                  */
 2729                                 dbAdjTree(tp, buddy, NOFREE);
 2730                         } else {
 2731                                 /* buddy is the left buddy and becomes
 2732                                  * leafno.
 2733                                  */
 2734                                 dbAdjTree(tp, leafno, NOFREE);
 2735                                 leafno = buddy;
 2736                         }
 2737 
 2738                         /* on to try the next join.
 2739                          */
 2740                         newval += 1;
 2741                         budsz <<= 1;
 2742                 }
 2743         }
 2744 
 2745         /* update the leaf value.
 2746          */
 2747         dbAdjTree(tp, leafno, newval);
 2748 }
 2749 
 2750 
 2751 /*
 2752  * NAME:        dbAdjTree()
 2753  *
 2754  * FUNCTION:    update a leaf of a dmtree with a new value, adjusting
 2755  *              the dmtree, as required, to reflect the new leaf value.
 2756  *              the combination of any buddies must already be done before
 2757  *              this is called.
 2758  *
 2759  * PARAMETERS:
 2760  *      tp      - pointer to the tree to be adjusted.
 2761  *      leafno  - the number of the leaf to be updated.
 2762  *      newval  - the new value for the leaf.
 2763  *
 2764  * RETURN VALUES: none
 2765  */
 2766 static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
 2767 {
 2768         int lp, pp, k;
 2769         int max;
 2770 
 2771         /* pick up the index of the leaf for this leafno.
 2772          */
 2773         lp = leafno + le32_to_cpu(tp->dmt_leafidx);
 2774 
 2775         /* is the current value the same as the old value ?  if so,
 2776          * there is nothing to do.
 2777          */
 2778         if (tp->dmt_stree[lp] == newval)
 2779                 return;
 2780 
 2781         /* set the new value.
 2782          */
 2783         tp->dmt_stree[lp] = newval;
 2784 
 2785         /* bubble the new value up the tree as required.
 2786          */
 2787         for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
 2788                 /* get the index of the first leaf of the 4 leaf
 2789                  * group containing the specified leaf (leafno).
 2790                  */
 2791                 lp = ((lp - 1) & ~0x03) + 1;
 2792 
 2793                 /* get the index of the parent of this 4 leaf group.
 2794                  */
 2795                 pp = (lp - 1) >> 2;
 2796 
 2797                 /* determine the maximum of the 4 leaves.
 2798                  */
 2799                 max = TREEMAX(&tp->dmt_stree[lp]);
 2800 
 2801                 /* if the maximum of the 4 is the same as the
 2802                  * parent's value, we're done.
 2803                  */
 2804                 if (tp->dmt_stree[pp] == max)
 2805                         break;
 2806 
 2807                 /* parent gets new value.
 2808                  */
 2809                 tp->dmt_stree[pp] = max;
 2810 
 2811                 /* parent becomes leaf for next go-round.
 2812                  */
 2813                 lp = pp;
 2814         }
 2815 }
 2816 
 2817 
 2818 /*
 2819  * NAME:        dbFindLeaf()
 2820  *
 2821  * FUNCTION:    search a dmtree_t for sufficient free blocks, returning
 2822  *              the index of a leaf describing the free blocks if 
 2823  *              sufficient free blocks are found.
 2824  *
 2825  *              the search starts at the top of the dmtree_t tree and
 2826  *              proceeds down the tree to the leftmost leaf with sufficient
 2827  *              free space.
 2828  *
 2829  * PARAMETERS:
 2830  *      tp      - pointer to the tree to be searched.
 2831  *      l2nb    - log2 number of free blocks to search for.
 2832  *      leafidx - return pointer to be set to the index of the leaf
 2833  *                describing at least l2nb free blocks if sufficient
 2834  *                free blocks are found.
 2835  *
 2836  * RETURN VALUES:
 2837  *      0       - success
 2838  *      ENOSPC  - insufficient free blocks. 
 2839  */
 2840 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
 2841 {
 2842         int ti, n = 0, k, x = 0;
 2843 
 2844         /* first check the root of the tree to see if there is
 2845          * sufficient free space.
 2846          */
 2847         if (l2nb > tp->dmt_stree[ROOT])
 2848                 return (ENOSPC);
 2849 
 2850         /* sufficient free space available. now search down the tree
 2851          * starting at the next level for the leftmost leaf that
 2852          * describes sufficient free space.
 2853          */
 2854         for (k = le32_to_cpu(tp->dmt_height), ti = 1;
 2855              k > 0; k--, ti = ((ti + n) << 2) + 1) {
 2856                 /* search the four nodes at this level, starting from
 2857                  * the left.
 2858                  */
 2859                 for (x = ti, n = 0; n < 4; n++) {
 2860                         /* sufficient free space found.  move to the next
 2861                          * level (or quit if this is the last level).
 2862                          */
 2863                         if (l2nb <= tp->dmt_stree[x + n])
 2864                                 break;
 2865                 }
 2866 
 2867                 /* better have found something since the higher
 2868                  * levels of the tree said it was here.
 2869                  */
 2870                 assert(n < 4);
 2871         }
 2872 
 2873         /* set the return to the leftmost leaf describing sufficient
 2874          * free space.
 2875          */
 2876         *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
 2877 
 2878         return (0);
 2879 }
 2880 
 2881 
 2882 /*
 2883  * NAME:        dbFindBits()
 2884  *
 2885  * FUNCTION:    find a specified number of binary buddy free bits within a
 2886  *              dmap bitmap word value.
 2887  *
 2888  *              this routine searches the bitmap value for (1 << l2nb) free
 2889  *              bits at (1 << l2nb) alignments within the value.
 2890  *
 2891  * PARAMETERS:
 2892  *      word    -  dmap bitmap word value.
 2893  *      l2nb    -  number of free bits specified as a log2 number.
 2894  *
 2895  * RETURN VALUES:
 2896  *      starting bit number of free bits.
 2897  */
 2898 static int dbFindBits(u32 word, int l2nb)
 2899 {
 2900         int bitno, nb;
 2901         u32 mask;
 2902 
 2903         /* get the number of bits.
 2904          */
 2905         nb = 1 << l2nb;
 2906         assert(nb <= DBWORD);
 2907 
 2908         /* complement the word so we can use a mask (i.e. 0s represent
 2909          * free bits) and compute the mask.
 2910          */
 2911         word = ~word;
 2912         mask = ONES << (DBWORD - nb);
 2913 
 2914         /* scan the word for nb free bits at nb alignments.
 2915          */
 2916         for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
 2917                 if ((mask & word) == mask)
 2918                         break;
 2919         }
 2920 
 2921         ASSERT(bitno < 32);
 2922 
 2923         /* return the bit number.
 2924          */
 2925         return (bitno);
 2926 }
 2927 
 2928 
 2929 /*
 2930  * NAME:        dbMaxBud(u8 *cp)
 2931  *
 2932  * FUNCTION:    determine the largest binary buddy string of free
 2933  *              bits within 32-bits of the map.
 2934  *
 2935  * PARAMETERS:
 2936  *      cp      -  pointer to the 32-bit value.
 2937  *
 2938  * RETURN VALUES:
 2939  *      largest binary buddy of free bits within a dmap word.
 2940  */
 2941 static int dbMaxBud(u8 * cp)
 2942 {
 2943         signed char tmp1, tmp2;
 2944 
 2945         /* check if the wmap word is all free. if so, the
 2946          * free buddy size is BUDMIN.
 2947          */
 2948         if (*((uint *) cp) == 0)
 2949                 return (BUDMIN);
 2950 
 2951         /* check if the wmap word is half free. if so, the
 2952          * free buddy size is BUDMIN-1.
 2953          */
 2954         if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
 2955                 return (BUDMIN - 1);
 2956 
 2957         /* not all free or half free. determine the free buddy
 2958          * size thru table lookup using quarters of the wmap word.
 2959          */
 2960         tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
 2961         tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
 2962         return (max(tmp1, tmp2));
 2963 }
 2964 
 2965 
 2966 /*
 2967  * NAME:        cnttz(uint word)
 2968  *
 2969  * FUNCTION:    determine the number of trailing zeros within a 32-bit
 2970  *              value.
 2971  *
 2972  * PARAMETERS:
 2973  *      value   -  32-bit value to be examined.
 2974  *
 2975  * RETURN VALUES:
 2976  *      count of trailing zeros
 2977  */
 2978 int cnttz(u32 word)
 2979 {
 2980         int n;
 2981 
 2982         for (n = 0; n < 32; n++, word >>= 1) {
 2983                 if (word & 0x01)
 2984                         break;
 2985         }
 2986 
 2987         return (n);
 2988 }
 2989 
 2990 
 2991 /*
 2992  * NAME:        cntlz(u32 value)
 2993  *
 2994  * FUNCTION:    determine the number of leading zeros within a 32-bit
 2995  *              value.
 2996  *
 2997  * PARAMETERS:
 2998  *      value   -  32-bit value to be examined.
 2999  *
 3000  * RETURN VALUES:
 3001  *      count of leading zeros
 3002  */
 3003 int cntlz(u32 value)
 3004 {
 3005         int n;
 3006 
 3007         for (n = 0; n < 32; n++, value <<= 1) {
 3008                 if (value & HIGHORDER)
 3009                         break;
 3010         }
 3011         return (n);
 3012 }
 3013 
 3014 
 3015 /*
 3016  * NAME:        blkstol2(s64 nb)
 3017  *
 3018  * FUNCTION:    convert a block count to its log2 value. if the block
 3019  *              count is not a l2 multiple, it is rounded up to the next
 3020  *              larger l2 multiple.
 3021  *
 3022  * PARAMETERS:
 3023  *      nb      -  number of blocks
 3024  *
 3025  * RETURN VALUES:
 3026  *      log2 number of blocks
 3027  */
 3028 int blkstol2(s64 nb)
 3029 {
 3030         int l2nb;
 3031         s64 mask;               /* meant to be signed */
 3032 
 3033         mask = (s64) 1 << (64 - 1);
 3034 
 3035         /* count the leading bits.
 3036          */
 3037         for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
 3038                 /* leading bit found.
 3039                  */
 3040                 if (nb & mask) {
 3041                         /* determine the l2 value.
 3042                          */
 3043                         l2nb = (64 - 1) - l2nb;
 3044 
 3045                         /* check if we need to round up.
 3046                          */
 3047                         if (~mask & nb)
 3048                                 l2nb++;
 3049 
 3050                         return (l2nb);
 3051                 }
 3052         }
 3053         assert(0);
 3054         return 0;               /* fix compiler warning */
 3055 }
 3056 
 3057 
 3058 /*
 3059  * NAME:        fsDirty()
 3060  *
 3061  * FUNCTION:    xxx
 3062  *
 3063  * PARAMETERS:
 3064  *      ipmnt   - mount inode
 3065  *
 3066  * RETURN VALUES:
 3067  *      none
 3068  */
 3069 void fsDirty()
 3070 {
 3071         printk("fsDirty(): bye-bye\n");
 3072         assert(0);
 3073 }
 3074 
 3075 
 3076 /*
 3077  * NAME:        dbAllocBottomUp()
 3078  *
 3079  * FUNCTION:    alloc the specified block range from the working block
 3080  *              allocation map.
 3081  *
 3082  *              the blocks will be alloc from the working map one dmap
 3083  *              at a time.
 3084  *
 3085  * PARAMETERS:
 3086  *      ip      -  pointer to in-core inode;
 3087  *      blkno   -  starting block number to be freed.
 3088  *      nblocks -  number of blocks to be freed.
 3089  *
 3090  * RETURN VALUES:
 3091  *      0       - success
 3092  *      EIO     - i/o error
 3093  */
 3094 int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
 3095 {
 3096         struct metapage *mp;
 3097         struct dmap *dp;
 3098         int nb, rc;
 3099         s64 lblkno, rem;
 3100         struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
 3101         struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 3102 
 3103         IREAD_LOCK(ipbmap);
 3104 
 3105         /* block to be allocated better be within the mapsize. */
 3106         ASSERT(nblocks <= bmp->db_mapsize - blkno);
 3107 
 3108         /*
 3109          * allocate the blocks a dmap at a time.
 3110          */
 3111         mp = NULL;
 3112         for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
 3113                 /* release previous dmap if any */
 3114                 if (mp) {
 3115                         write_metapage(mp);
 3116                 }
 3117 
 3118                 /* get the buffer for the current dmap. */
 3119                 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
 3120                 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
 3121                 if (mp == NULL) {
 3122                         IREAD_UNLOCK(ipbmap);
 3123                         return (EIO);
 3124                 }
 3125                 dp = (struct dmap *) mp->data;
 3126 
 3127                 /* determine the number of blocks to be allocated from
 3128                  * this dmap.
 3129                  */
 3130                 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
 3131 
 3132                 DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
 3133 
 3134                 /* allocate the blocks. */
 3135                 if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
 3136                         release_metapage(mp);
 3137                         IREAD_UNLOCK(ipbmap);
 3138                         return (rc);
 3139                 }
 3140 
 3141                 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
 3142         }
 3143 
 3144         /* write the last buffer. */
 3145         write_metapage(mp);
 3146 
 3147         IREAD_UNLOCK(ipbmap);
 3148 
 3149         return (0);
 3150 }
 3151 
 3152 
 3153 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
 3154                          int nblocks)
 3155 {
 3156         int rc;
 3157         int dbitno, word, rembits, nb, nwords, wbitno, agno;
 3158         s8 oldroot, *leaf;
 3159         struct dmaptree *tp = (struct dmaptree *) & dp->tree;
 3160 
 3161         /* save the current value of the root (i.e. maximum free string)
 3162          * of the dmap tree.
 3163          */
 3164         oldroot = tp->stree[ROOT];
 3165 
 3166         /* pick up a pointer to the leaves of the dmap tree */
 3167         leaf = tp->stree + LEAFIND;
 3168 
 3169         /* determine the bit number and word within the dmap of the
 3170          * starting block.
 3171          */
 3172         dbitno = blkno & (BPERDMAP - 1);
 3173         word = dbitno >> L2DBWORD;
 3174 
 3175         /* block range better be within the dmap */
 3176         assert(dbitno + nblocks <= BPERDMAP);
 3177 
 3178         /* allocate the bits of the dmap's words corresponding to the block
 3179          * range. not all bits of the first and last words may be contained
 3180          * within the block range.  if this is the case, we'll work against
 3181          * those words (i.e. partial first and/or last) on an individual basis
 3182          * (a single pass), allocating the bits of interest by hand and
 3183          * updating the leaf corresponding to the dmap word. a single pass
 3184          * will be used for all dmap words fully contained within the
 3185          * specified range.  within this pass, the bits of all fully contained
 3186          * dmap words will be marked as free in a single shot and the leaves
 3187          * will be updated. a single leaf may describe the free space of
 3188          * multiple dmap words, so we may update only a subset of the actual
 3189          * leaves corresponding to the dmap words of the block range.
 3190          */
 3191         for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
 3192                 /* determine the bit number within the word and
 3193                  * the number of bits within the word.
 3194                  */
 3195                 wbitno = dbitno & (DBWORD - 1);
 3196                 nb = min(rembits, DBWORD - wbitno);
 3197 
 3198                 /* check if only part of a word is to be allocated.
 3199                  */
 3200                 if (nb < DBWORD) {
 3201                         /* allocate (set to 1) the appropriate bits within
 3202                          * this dmap word.
 3203                          */
 3204                         dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
 3205                                                       >> wbitno);
 3206 
 3207                         word++;
 3208                 } else {
 3209                         /* one or more dmap words are fully contained
 3210                          * within the block range.  determine how many
 3211                          * words and allocate (set to 1) the bits of these
 3212                          * words.
 3213                          */
 3214                         nwords = rembits >> L2DBWORD;
 3215                         memset(&dp->wmap[word], (int) ONES, nwords * 4);
 3216 
 3217                         /* determine how many bits */
 3218                         nb = nwords << L2DBWORD;
 3219                         word += nwords;
 3220                 }
 3221         }
 3222 
 3223         /* update the free count for this dmap */
 3224         dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
 3225 
 3226         /* reconstruct summary tree */
 3227         dbInitDmapTree(dp);
 3228 
 3229         BMAP_LOCK(bmp);
 3230 
 3231         /* if this allocation group is completely free,
 3232          * update the highest active allocation group number 
 3233          * if this allocation group is the new max.
 3234          */
 3235         agno = blkno >> bmp->db_agl2size;
 3236         if (agno > bmp->db_maxag)
 3237                 bmp->db_maxag = agno;
 3238 
 3239         /* update the free count for the allocation group and map */
 3240         bmp->db_agfree[agno] -= nblocks;
 3241         bmp->db_nfree -= nblocks;
 3242 
 3243         BMAP_UNLOCK(bmp);
 3244 
 3245         /* if the root has not changed, done. */
 3246         if (tp->stree[ROOT] == oldroot)
 3247                 return (0);
 3248 
 3249         /* root changed. bubble the change up to the dmap control pages.
 3250          * if the adjustment of the upper level control pages fails,
 3251          * backout the bit allocation (thus making everything consistent).
 3252          */
 3253         if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
 3254                 dbFreeBits(bmp, dp, blkno, nblocks);
 3255 
 3256         return (rc);
 3257 }
 3258 
 3259 
 3260 /*
 3261  * NAME:        dbExtendFS()
 3262  *
 3263  * FUNCTION:    extend bmap from blkno for nblocks;
 3264  *              dbExtendFS() updates bmap ready for dbAllocBottomUp();
 3265  *
 3266  * L2
 3267  *  |
 3268  *   L1---------------------------------L1
 3269  *    |                                  |
 3270  *     L0---------L0---------L0           L0---------L0---------L0
 3271  *      |          |          |            |          |          |
 3272  *       d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
 3273  * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
 3274  *
 3275  * <---old---><----------------------------extend----------------------->   
 3276  */
 3277 int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
 3278 {
 3279         struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
 3280         int nbperpage = sbi->nbperpage;
 3281         int i, i0 = TRUE, j, j0 = TRUE, k, n;
 3282         s64 newsize;
 3283         s64 p;
 3284         struct metapage *mp, *l2mp, *l1mp, *l0mp;
 3285         struct dmapctl *l2dcp, *l1dcp, *l0dcp;
 3286         struct dmap *dp;
 3287         s8 *l0leaf, *l1leaf, *l2leaf;
 3288         struct bmap *bmp = sbi->bmap;
 3289         int agno, l2agsize, oldl2agsize;
 3290         s64 ag_rem;
 3291 
 3292         newsize = blkno + nblocks;
 3293 
 3294         jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
 3295                  (long long) blkno, (long long) nblocks, (long long) newsize);
 3296 
 3297         /*
 3298          *      initialize bmap control page.
 3299          *
 3300          * all the data in bmap control page should exclude
 3301          * the mkfs hidden dmap page.
 3302          */
 3303 
 3304         /* update mapsize */
 3305         bmp->db_mapsize = newsize;
 3306         bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
 3307 
 3308         /* compute new AG size */
 3309         l2agsize = dbGetL2AGSize(newsize);
 3310         oldl2agsize = bmp->db_agl2size;
 3311 
 3312         bmp->db_agl2size = l2agsize;
 3313         bmp->db_agsize = 1 << l2agsize;
 3314 
 3315         /* compute new number of AG */
 3316         agno = bmp->db_numag;
 3317         bmp->db_numag = newsize >> l2agsize;
 3318         bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
 3319 
 3320         /*
 3321          *      reconfigure db_agfree[] 
 3322          * from old AG configuration to new AG configuration;
 3323          *
 3324          * coalesce contiguous k (newAGSize/oldAGSize) AGs;
 3325          * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
 3326          * note: new AG size = old AG size * (2**x).
 3327          */
 3328         if (l2agsize == oldl2agsize)
 3329                 goto extend;
 3330         k = 1 << (l2agsize - oldl2agsize);
 3331         ag_rem = bmp->db_agfree[0];     /* save agfree[0] */
 3332         for (i = 0, n = 0; i < agno; n++) {
 3333                 bmp->db_agfree[n] = 0;  /* init collection point */
 3334 
 3335                 /* coalesce cotiguous k AGs; */
 3336                 for (j = 0; j < k && i < agno; j++, i++) {
 3337                         /* merge AGi to AGn */
 3338                         bmp->db_agfree[n] += bmp->db_agfree[i];
 3339                 }
 3340         }
 3341         bmp->db_agfree[0] += ag_rem;    /* restore agfree[0] */
 3342 
 3343         for (; n < MAXAG; n++)
 3344                 bmp->db_agfree[n] = 0;
 3345 
 3346         /*
 3347          * update highest active ag number
 3348          */
 3349 
 3350         bmp->db_maxag = bmp->db_maxag / k;
 3351 
 3352         /*
 3353          *      extend bmap
 3354          *
 3355          * update bit maps and corresponding level control pages;
 3356          * global control page db_nfree, db_agfree[agno], db_maxfreebud;
 3357          */
 3358       extend:
 3359         /* get L2 page */
 3360         p = BMAPBLKNO + nbperpage;      /* L2 page */
 3361         l2mp = read_metapage(ipbmap, p, PSIZE, 0);
 3362         assert(l2mp);
 3363         l2dcp = (struct dmapctl *) l2mp->data;
 3364 
 3365         /* compute start L1 */
 3366         k = blkno >> L2MAXL1SIZE;
 3367         l2leaf = l2dcp->stree + CTLLEAFIND + k;
 3368         p = BLKTOL1(blkno, sbi->l2nbperpage);   /* L1 page */
 3369 
 3370         /*
 3371          * extend each L1 in L2
 3372          */
 3373         for (; k < LPERCTL; k++, p += nbperpage) {
 3374                 /* get L1 page */
 3375                 if (j0) {
 3376                         /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
 3377                         l1mp = read_metapage(ipbmap, p, PSIZE, 0);
 3378                         if (l1mp == NULL)
 3379                                 goto errout;
 3380                         l1dcp = (struct dmapctl *) l1mp->data;
 3381 
 3382                         /* compute start L0 */
 3383                         j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
 3384                         l1leaf = l1dcp->stree + CTLLEAFIND + j;
 3385                         p = BLKTOL0(blkno, sbi->l2nbperpage);
 3386                         j0 = FALSE;
 3387                 } else {
 3388                         /* assign/init L1 page */
 3389                         l1mp = get_metapage(ipbmap, p, PSIZE, 0);
 3390                         if (l1mp == NULL)
 3391                                 goto errout;
 3392 
 3393                         l1dcp = (struct dmapctl *) l1mp->data;
 3394 
 3395                         /* compute start L0 */
 3396                         j = 0;
 3397                         l1leaf = l1dcp->stree + CTLLEAFIND;
 3398                         p += nbperpage; /* 1st L0 of L1.k  */
 3399                 }
 3400 
 3401                 /*
 3402                  * extend each L0 in L1
 3403                  */
 3404                 for (; j < LPERCTL; j++) {
 3405                         /* get L0 page */
 3406                         if (i0) {
 3407                                 /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
 3408 
 3409                                 l0mp = read_metapage(ipbmap, p, PSIZE, 0);
 3410                                 if (l0mp == NULL)
 3411                                         goto errout;
 3412                                 l0dcp = (struct dmapctl *) l0mp->data;
 3413 
 3414                                 /* compute start dmap */
 3415                                 i = (blkno & (MAXL0SIZE - 1)) >>
 3416                                     L2BPERDMAP;
 3417                                 l0leaf = l0dcp->stree + CTLLEAFIND + i;
 3418                                 p = BLKTODMAP(blkno,
 3419                                               sbi->l2nbperpage);
 3420                                 i0 = FALSE;
 3421                         } else {
 3422                                 /* assign/init L0 page */
 3423                                 l0mp = get_metapage(ipbmap, p, PSIZE, 0);
 3424                                 if (l0mp == NULL)
 3425                                         goto errout;
 3426 
 3427                                 l0dcp = (struct dmapctl *) l0mp->data;
 3428 
 3429                                 /* compute start dmap */
 3430                                 i = 0;
 3431                                 l0leaf = l0dcp->stree + CTLLEAFIND;
 3432                                 p += nbperpage; /* 1st dmap of L0.j */
 3433                         }
 3434 
 3435                         /*
 3436                          * extend each dmap in L0
 3437                          */
 3438                         for (; i < LPERCTL; i++) {
 3439                                 /*
 3440                                  * reconstruct the dmap page, and
 3441                                  * initialize corresponding parent L0 leaf
 3442                                  */
 3443                                 if ((n = blkno & (BPERDMAP - 1))) {
 3444                                         /* read in dmap page: */
 3445                                         mp = read_metapage(ipbmap, p,
 3446                                                            PSIZE, 0);
 3447                                         if (mp == NULL)
 3448                                                 goto errout;
 3449                                         n = min(nblocks, (s64)BPERDMAP - n);
 3450                                 } else {
 3451                                         /* assign/init dmap page */
 3452                                         mp = read_metapage(ipbmap, p,
 3453                                                            PSIZE, 0);
 3454                                         if (mp == NULL)
 3455                                                 goto errout;
 3456 
 3457                                         n = min(nblocks, (s64)BPERDMAP);
 3458                                 }
 3459 
 3460                                 dp = (struct dmap *) mp->data;
 3461                                 *l0leaf = dbInitDmap(dp, blkno, n);
 3462 
 3463                                 bmp->db_nfree += n;
 3464                                 agno = le64_to_cpu(dp->start) >> l2agsize;
 3465                                 bmp->db_agfree[agno] += n;
 3466 
 3467                                 write_metapage(mp);
 3468 
 3469                                 l0leaf++;
 3470                                 p += nbperpage;
 3471 
 3472                                 blkno += n;
 3473                                 nblocks -= n;
 3474                                 if (nblocks == 0)
 3475                                         break;
 3476                         }       /* for each dmap in a L0 */
 3477 
 3478                         /*
 3479                          * build current L0 page from its leaves, and 
 3480                          * initialize corresponding parent L1 leaf
 3481                          */
 3482                         *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
 3483                         write_metapage(l0mp);
 3484 
 3485                         if (nblocks)
 3486                                 l1leaf++;       /* continue for next L0 */
 3487                         else {
 3488                                 /* more than 1 L0 ? */
 3489                                 if (j > 0)
 3490                                         break;  /* build L1 page */
 3491                                 else {
 3492                                         /* summarize in global bmap page */
 3493                                         bmp->db_maxfreebud = *l1leaf;
 3494                                         release_metapage(l1mp);
 3495                                         release_metapage(l2mp);
 3496                                         goto finalize;
 3497                                 }
 3498                         }
 3499                 }               /* for each L0 in a L1 */
 3500 
 3501                 /*
 3502                  * build current L1 page from its leaves, and 
 3503                  * initialize corresponding parent L2 leaf
 3504                  */
 3505                 *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
 3506                 write_metapage(l1mp);
 3507 
 3508                 if (nblocks)
 3509                         l2leaf++;       /* continue for next L1 */
 3510                 else {
 3511                         /* more than 1 L1 ? */
 3512                         if (k > 0)
 3513                                 break;  /* build L2 page */
 3514                         else {
 3515                                 /* summarize in global bmap page */
 3516                                 bmp->db_maxfreebud = *l2leaf;
 3517                                 release_metapage(l2mp);
 3518                                 goto finalize;
 3519                         }
 3520                 }
 3521         }                       /* for each L1 in a L2 */
 3522 
 3523         assert(0);
 3524 
 3525         /*
 3526          *      finalize bmap control page
 3527          */
 3528       finalize:
 3529 
 3530         return 0;
 3531 
 3532       errout:
 3533         return EIO;
 3534 }
 3535 
 3536 
 3537 /*
 3538  *      dbFinalizeBmap()
 3539  */
 3540 void dbFinalizeBmap(struct inode *ipbmap)
 3541 {
 3542         struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
 3543         int actags, inactags, l2nl;
 3544         s64 ag_rem, actfree, inactfree, avgfree;
 3545         int i, n;
 3546 
 3547         /*
 3548          *      finalize bmap control page
 3549          */
 3550 //finalize:
 3551         /* 
 3552          * compute db_agpref: preferred ag to allocate from
 3553          * (the leftmost ag with average free space in it);
 3554          */
 3555 //agpref:
 3556         /* get the number of active ags and inacitve ags */
 3557         actags = bmp->db_maxag + 1;
 3558         inactags = bmp->db_numag - actags;
 3559         ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);        /* ??? */
 3560 
 3561         /* determine how many blocks are in the inactive allocation
 3562          * groups. in doing this, we must account for the fact that
 3563          * the rightmost group might be a partial group (i.e. file
 3564          * system size is not a multiple of the group size).
 3565          */
 3566         inactfree = (inactags && ag_rem) ?
 3567             ((inactags - 1) << bmp->db_agl2size) + ag_rem
 3568             : inactags << bmp->db_agl2size;
 3569 
 3570         /* determine how many free blocks are in the active
 3571          * allocation groups plus the average number of free blocks
 3572          * within the active ags.
 3573          */
 3574         actfree = bmp->db_nfree - inactfree;
 3575         avgfree = (u32) actfree / (u32) actags;
 3576 
 3577         /* if the preferred allocation group has not average free space.
 3578          * re-establish the preferred group as the leftmost
 3579          * group with average free space.
 3580          */
 3581         if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
 3582                 for (bmp->db_agpref = 0; bmp->db_agpref < actags;
 3583                      bmp->db_agpref++) {
 3584                         if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
 3585                                 break;
 3586                 }
 3587                 assert(bmp->db_agpref < bmp->db_numag);
 3588         }
 3589 
 3590         /*
 3591          * compute db_aglevel, db_agheigth, db_width, db_agstart:
 3592          * an ag is covered in aglevel dmapctl summary tree, 
 3593          * at agheight level height (from leaf) with agwidth number of nodes 
 3594          * each, which starts at agstart index node of the smmary tree node 
 3595          * array;
 3596          */
 3597         bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
 3598         l2nl =
 3599             bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
 3600         bmp->db_agheigth = l2nl >> 1;
 3601         bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
 3602         for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
 3603              i--) {
 3604                 bmp->db_agstart += n;
 3605                 n <<= 2;
 3606         }
 3607 
 3608 /*
 3609 printk("bmap: agpref:%d aglevel:%d agheigth:%d agwidth:%d\n",
 3610         bmp->db_agpref, bmp->db_aglevel, bmp->db_agheigth, bmp->db_agwidth);
 3611 */
 3612 }
 3613 
 3614 
 3615 /*
 3616  * NAME:        dbInitDmap()/ujfs_idmap_page()
 3617  *                                                                    
 3618  * FUNCTION:    initialize working/persistent bitmap of the dmap page
 3619  *              for the specified number of blocks:
 3620  *                                                                    
 3621  *              at entry, the bitmaps had been initialized as free (ZEROS);
 3622  *              The number of blocks will only account for the actually 
 3623  *              existing blocks. Blocks which don't actually exist in 
 3624  *              the aggregate will be marked as allocated (ONES);
 3625  *
 3626  * PARAMETERS:
 3627  *      dp      - pointer to page of map
 3628  *      nblocks - number of blocks this page
 3629  *
 3630  * RETURNS: NONE
 3631  */
 3632 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
 3633 {
 3634         int blkno, w, b, r, nw, nb, i;
 3635 /*
 3636 printk("sbh_dmap:  in dbInitDmap blkno:%Ld nblocks:%ld\n", Blkno, nblocks); 
 3637 */
 3638 
 3639         /* starting block number within the dmap */
 3640         blkno = Blkno & (BPERDMAP - 1);
 3641 
 3642         if (blkno == 0) {
 3643                 dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
 3644                 dp->start = cpu_to_le64(Blkno);
 3645 
 3646                 if (nblocks == BPERDMAP) {
 3647                         memset(&dp->wmap[0], 0, LPERDMAP * 4);
 3648                         memset(&dp->pmap[0], 0, LPERDMAP * 4);
 3649                         goto initTree;
 3650                 }
 3651         } else {
 3652                 dp->nblocks =
 3653                     cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
 3654                 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
 3655         }
 3656 
 3657         /* word number containing start block number */
 3658         w = blkno >> L2DBWORD;
 3659 
 3660         /*
 3661          * free the bits corresponding to the block range (ZEROS):
 3662          * note: not all bits of the first and last words may be contained 
 3663          * within the block range.
 3664          */
 3665         for (r = nblocks; r > 0; r -= nb, blkno += nb) {
 3666                 /* number of bits preceding range to be freed in the word */
 3667                 b = blkno & (DBWORD - 1);
 3668                 /* number of bits to free in the word */
 3669                 nb = min(r, DBWORD - b);
 3670 
 3671                 /* is partial word to be freed ? */
 3672                 if (nb < DBWORD) {
 3673                         /* free (set to 0) from the bitmap word */
 3674                         dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
 3675                                                      >> b));
 3676                         dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
 3677                                                      >> b));
 3678 
 3679                         /* skip the word freed */
 3680                         w++;
 3681                 } else {
 3682                         /* free (set to 0) contiguous bitmap words */
 3683                         nw = r >> L2DBWORD;
 3684                         memset(&dp->wmap[w], 0, nw * 4);
 3685                         memset(&dp->pmap[w], 0, nw * 4);
 3686 
 3687                         /* skip the words freed */
 3688                         nb = nw << L2DBWORD;
 3689                         w += nw;
 3690                 }
 3691         }
 3692 
 3693         /*
 3694          * mark bits following the range to be freed (non-existing 
 3695          * blocks) as allocated (ONES)
 3696          */
 3697 /*
 3698 printk("sbh_dmap:  in dbInitDmap, preparing to mark unbacked, blkno:%ld nblocks:%ld\n",
 3699                 blkno, nblocks); 
 3700 */
 3701 
 3702         if (blkno == BPERDMAP)
 3703                 goto initTree;
 3704 
 3705         /* the first word beyond the end of existing blocks */
 3706         w = blkno >> L2DBWORD;
 3707 
 3708         /* does nblocks fall on a 32-bit boundary ? */
 3709         b = blkno & (DBWORD - 1);
 3710 /*
 3711 printk("sbh_dmap:  in dbInitDmap, b:%ld w:%ld mask: %lx\n", b, w, (ONES>>b)); 
 3712 */
 3713         if (b) {
 3714                 /* mark a partial word allocated */
 3715                 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
 3716                 w++;
 3717         }
 3718 
 3719         /* set the rest of the words in the page to allocated (ONES) */
 3720         for (i = w; i < LPERDMAP; i++)
 3721                 dp->pmap[i] = dp->wmap[i] = ONES;
 3722 
 3723         /*
 3724          * init tree
 3725          */
 3726       initTree:
 3727         return (dbInitDmapTree(dp));
 3728 }
 3729 
 3730 
 3731 /*
 3732  * NAME:        dbInitDmapTree()/ujfs_complete_dmap()
 3733  *                                                                    
 3734  * FUNCTION:    initialize summary tree of the specified dmap:
 3735  *
 3736  *              at entry, bitmap of the dmap has been initialized;
 3737  *                                                                    
 3738  * PARAMETERS:
 3739  *      dp      - dmap to complete
 3740  *      blkno   - starting block number for this dmap
 3741  *      treemax - will be filled in with max free for this dmap
 3742  *
 3743  * RETURNS:     max free string at the root of the tree
 3744  */
 3745 static int dbInitDmapTree(struct dmap * dp)
 3746 {
 3747         struct dmaptree *tp;
 3748         s8 *cp;
 3749         int i;
 3750 
 3751         /* init fixed info of tree */
 3752         tp = &dp->tree;
 3753         tp->nleafs = cpu_to_le32(LPERDMAP);
 3754         tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
 3755         tp->leafidx = cpu_to_le32(LEAFIND);
 3756         tp->height = cpu_to_le32(4);
 3757         tp->budmin = BUDMIN;
 3758 
 3759         /* init each leaf from corresponding wmap word:
 3760          * note: leaf is set to NOFREE(-1) if all blocks of corresponding
 3761          * bitmap word are allocated. 
 3762          */
 3763         cp = tp->stree + le32_to_cpu(tp->leafidx);
 3764         for (i = 0; i < LPERDMAP; i++)
 3765                 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
 3766 
 3767         /* build the dmap's binary buddy summary tree */
 3768         return (dbInitTree(tp));
 3769 }
 3770 
 3771 
 3772 /*
 3773  * NAME:        dbInitTree()/ujfs_adjtree()
 3774  *                                                                    
 3775  * FUNCTION:    initialize binary buddy summary tree of a dmap or dmapctl.
 3776  *
 3777  *              at entry, the leaves of the tree has been initialized 
 3778  *              from corresponding bitmap word or root of summary tree
 3779  *              of the child control page;
 3780  *              configure binary buddy system at the leaf level, then
 3781  *              bubble up the values of the leaf nodes up the tree.
 3782  *
 3783  * PARAMETERS:
 3784  *      cp      - Pointer to the root of the tree
 3785  *      l2leaves- Number of leaf nodes as a power of 2
 3786  *      l2min   - Number of blocks that can be covered by a leaf
 3787  *                as a power of 2
 3788  *
 3789  * RETURNS: max free string at the root of the tree
 3790  */
 3791 static int dbInitTree(struct dmaptree * dtp)
 3792 {
 3793         int l2max, l2free, bsize, nextb, i;
 3794         int child, parent, nparent;
 3795         s8 *tp, *cp, *cp1;
 3796 
 3797         tp = dtp->stree;
 3798 
 3799         /* Determine the maximum free string possible for the leaves */
 3800         l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
 3801 
 3802         /*
 3803          * configure the leaf levevl into binary buddy system
 3804          *
 3805          * Try to combine buddies starting with a buddy size of 1 
 3806          * (i.e. two leaves). At a buddy size of 1 two buddy leaves 
 3807          * can be combined if both buddies have a maximum free of l2min; 
 3808          * the combination will result in the left-most buddy leaf having 
 3809          * a maximum free of l2min+1.  
 3810          * After processing all buddies for a given size, process buddies 
 3811          * at the next higher buddy size (i.e. current size * 2) and 
 3812          * the next maximum free (current free + 1).  
 3813          * This continues until the maximum possible buddy combination 
 3814          * yields maximum free.
 3815          */
 3816         for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
 3817              l2free++, bsize = nextb) {
 3818                 /* get next buddy size == current buddy pair size */
 3819                 nextb = bsize << 1;
 3820 
 3821                 /* scan each adjacent buddy pair at current buddy size */
 3822                 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
 3823                      i < le32_to_cpu(dtp->nleafs);
 3824                      i += nextb, cp += nextb) {
 3825                         /* coalesce if both adjacent buddies are max free */
 3826                         if (*cp == l2free && *(cp + bsize) == l2free) {
 3827                                 *cp = l2free + 1;       /* left take right */
 3828                                 *(cp + bsize) = -1;     /* right give left */
 3829                         }
 3830                 }
 3831         }
 3832 
 3833         /*
 3834          * bubble summary information of leaves up the tree.
 3835          *
 3836          * Starting at the leaf node level, the four nodes described by
 3837          * the higher level parent node are compared for a maximum free and 
 3838          * this maximum becomes the value of the parent node.  
 3839          * when all lower level nodes are processed in this fashion then 
 3840          * move up to the next level (parent becomes a lower level node) and 
 3841          * continue the process for that level.
 3842          */
 3843         for (child = le32_to_cpu(dtp->leafidx),
 3844              nparent = le32_to_cpu(dtp->nleafs) >> 2;
 3845              nparent > 0; nparent >>= 2, child = parent) {
 3846                 /* get index of 1st node of parent level */
 3847                 parent = (child - 1) >> 2;
 3848 
 3849                 /* set the value of the parent node as the maximum 
 3850                  * of the four nodes of the current level.
 3851                  */
 3852                 for (i = 0, cp = tp + child, cp1 = tp + parent;
 3853                      i < nparent; i++, cp += 4, cp1++)
 3854                         *cp1 = TREEMAX(cp);
 3855         }
 3856 
 3857         return (*tp);
 3858 }
 3859 
 3860 
 3861 /*
 3862  *      dbInitDmapCtl()
 3863  *
 3864  * function: initialize dmapctl page
 3865  */
 3866 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
 3867 {                               /* start leaf index not covered by range */
 3868         s8 *cp;
 3869 
 3870         dcp->nleafs = cpu_to_le32(LPERCTL);
 3871         dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
 3872         dcp->leafidx = cpu_to_le32(CTLLEAFIND);
 3873         dcp->height = cpu_to_le32(5);
 3874         dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
 3875 
 3876         /*
 3877          * initialize the leaves of current level that were not covered 
 3878          * by the specified input block range (i.e. the leaves have no 
 3879          * low level dmapctl or dmap).
 3880          */
 3881         cp = &dcp->stree[CTLLEAFIND + i];
 3882         for (; i < LPERCTL; i++)
 3883                 *cp++ = NOFREE;
 3884 
 3885         /* build the dmap's binary buddy summary tree */
 3886         return (dbInitTree((struct dmaptree *) dcp));
 3887 }
 3888 
 3889 
 3890 /*
 3891  * NAME:        dbGetL2AGSize()/ujfs_getagl2size()
 3892  *                                                                    
 3893  * FUNCTION:    Determine log2(allocation group size) from aggregate size
 3894  *                                                                    
 3895  * PARAMETERS:
 3896  *      nblocks - Number of blocks in aggregate
 3897  *
 3898  * RETURNS: log2(allocation group size) in aggregate blocks
 3899  */
 3900 static int dbGetL2AGSize(s64 nblocks)
 3901 {
 3902         s64 sz;
 3903         s64 m;
 3904         int l2sz;
 3905 
 3906         if (nblocks < BPERDMAP * MAXAG)
 3907                 return (L2BPERDMAP);
 3908 
 3909         /* round up aggregate size to power of 2 */
 3910         m = ((u64) 1 << (64 - 1));
 3911         for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
 3912                 if (m & nblocks)
 3913                         break;
 3914         }
 3915 
 3916         sz = (s64) 1 << l2sz;
 3917         if (sz < nblocks)
 3918                 l2sz += 1;
 3919 
 3920         /* agsize = roundupSize/max_number_of_ag */
 3921         return (l2sz - L2MAXAG);
 3922 }
 3923 
 3924 
 3925 /*
 3926  * NAME:        dbMapFileSizeToMapSize()
 3927  *                                                                    
 3928  * FUNCTION:    compute number of blocks the block allocation map file 
 3929  *              can cover from the map file size;
 3930  *
 3931  * RETURNS:     Number of blocks which can be covered by this block map file;
 3932  */
 3933 
 3934 /*
 3935  * maximum number of map pages at each level including control pages
 3936  */
 3937 #define MAXL0PAGES      (1 + LPERCTL)
 3938 #define MAXL1PAGES      (1 + LPERCTL * MAXL0PAGES)
 3939 #define MAXL2PAGES      (1 + LPERCTL * MAXL1PAGES)
 3940 
 3941 /*
 3942  * convert number of map pages to the zero origin top dmapctl level
 3943  */
 3944 #define BMAPPGTOLEV(npages)     \
 3945         (((npages) <= 3 + MAXL0PAGES) ? 0 \
 3946        : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
 3947 
 3948 s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
 3949 {
 3950         struct super_block *sb = ipbmap->i_sb;
 3951         s64 nblocks;
 3952         s64 npages, ndmaps;
 3953         int level, i;
 3954         int complete, factor;
 3955 
 3956         nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
 3957         npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
 3958         level = BMAPPGTOLEV(npages);
 3959 
 3960         /* At each level, accumulate the number of dmap pages covered by 
 3961          * the number of full child levels below it;
 3962          * repeat for the last incomplete child level.
 3963          */
 3964         ndmaps = 0;
 3965         npages--;               /* skip the first global control page */
 3966         /* skip higher level control pages above top level covered by map */
 3967         npages -= (2 - level);
 3968         npages--;               /* skip top level's control page */
 3969         for (i = level; i >= 0; i--) {
 3970                 factor =
 3971                     (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
 3972                 complete = (u32) npages / factor;
 3973                 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL
 3974                                       : ((i == 1) ? LPERCTL : 1));
 3975 
 3976                 /* pages in last/incomplete child */
 3977                 npages = (u32) npages % factor;
 3978                 /* skip incomplete child's level control page */
 3979                 npages--;
 3980         }
 3981 
 3982         /* convert the number of dmaps into the number of blocks 
 3983          * which can be covered by the dmaps;
 3984          */
 3985         nblocks = ndmaps << L2BPERDMAP;
 3986 
 3987         return (nblocks);
 3988 }
 3989 
 3990 
 3991 #ifdef  _JFS_DEBUG_DMAP
 3992 /*
 3993  *      DBinitmap()
 3994  */
 3995 static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results)
 3996 {
 3997         int npages;
 3998         u32 *dbmap, *d;
 3999         int n;
 4000         s64 lblkno, cur_block;
 4001         struct dmap *dp;
 4002         struct metapage *mp;
 4003 
 4004         npages = size / 32768;
 4005         npages += (size % 32768) ? 1 : 0;
 4006 
 4007         dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap);
 4008         if (dbmap == NULL)
 4009                 assert(0);
 4010 
 4011         for (n = 0, d = dbmap; n < npages; n++, d += 1024)
 4012                 bzero(d, 4096);
 4013 
 4014         /* Need to initialize from disk map pages
 4015          */
 4016         for (d = dbmap, cur_block = 0; cur_block < size;
 4017              cur_block += BPERDMAP, d += LPERDMAP) {
 4018                 lblkno = BLKTODMAP(cur_block,
 4019                                    JFS_SBI(ipbmap->i_sb)->bmap->
 4020                                    db_l2nbperpage);
 4021                 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
 4022                 if (mp == NULL) {
 4023                         assert(0);
 4024                 }
 4025                 dp = (struct dmap *) mp->data;
 4026 
 4027                 for (n = 0; n < LPERDMAP; n++)
 4028                         d[n] = le32_to_cpu(dp->wmap[n]);
 4029 
 4030                 release_metapage(mp);
 4031         }
 4032 
 4033         *results = dbmap;
 4034 }
 4035 
 4036 
 4037 /*
 4038  *      DBAlloc()
 4039  */
 4040 void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
 4041 {
 4042         int word, nb, bitno;
 4043         u32 mask;
 4044 
 4045         assert(blkno > 0 && blkno < mapsize);
 4046         assert(nblocks > 0 && nblocks <= mapsize);
 4047 
 4048         assert(blkno + nblocks <= mapsize);
 4049 
 4050         dbmap += (blkno / 32);
 4051         while (nblocks > 0) {
 4052                 bitno = blkno & (32 - 1);
 4053                 nb = min(nblocks, 32 - bitno);
 4054 
 4055                 mask = (0xffffffff << (32 - nb) >> bitno);
 4056                 assert((mask & *dbmap) == 0);
 4057                 *dbmap |= mask;
 4058 
 4059                 dbmap++;
 4060                 blkno += nb;
 4061                 nblocks -= nb;
 4062         }
 4063 }
 4064 
 4065 
 4066 /*
 4067  *      DBFree()
 4068  */
 4069 static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
 4070 {
 4071         int word, nb, bitno;
 4072         u32 mask;
 4073 
 4074         assert(blkno > 0 && blkno < mapsize);
 4075         assert(nblocks > 0 && nblocks <= mapsize);
 4076 
 4077         assert(blkno + nblocks <= mapsize);
 4078 
 4079         dbmap += (blkno / 32);
 4080         while (nblocks > 0) {
 4081                 bitno = blkno & (32 - 1);
 4082                 nb = min(nblocks, 32 - bitno);
 4083 
 4084                 mask = (0xffffffff << (32 - nb) >> bitno);
 4085                 assert((mask & *dbmap) == mask);
 4086                 *dbmap &= ~mask;
 4087 
 4088                 dbmap++;
 4089                 blkno += nb;
 4090                 nblocks -= nb;
 4091         }
 4092 }
 4093 
 4094 
 4095 /*
 4096  *      DBAllocCK()
 4097  */
 4098 static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
 4099 {
 4100         int word, nb, bitno;
 4101         u32 mask;
 4102 
 4103         assert(blkno > 0 && blkno < mapsize);
 4104         assert(nblocks > 0 && nblocks <= mapsize);
 4105 
 4106         assert(blkno + nblocks <= mapsize);
 4107 
 4108         dbmap += (blkno / 32);
 4109         while (nblocks > 0) {
 4110                 bitno = blkno & (32 - 1);
 4111                 nb = min(nblocks, 32 - bitno);
 4112 
 4113                 mask = (0xffffffff << (32 - nb) >> bitno);
 4114                 assert((mask & *dbmap) == mask);
 4115 
 4116                 dbmap++;
 4117                 blkno += nb;
 4118                 nblocks -= nb;
 4119         }
 4120 }
 4121 
 4122 
 4123 /*
 4124  *      DBFreeCK()
 4125  */
 4126 static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
 4127 {
 4128         int word, nb, bitno;
 4129         u32 mask;
 4130 
 4131         assert(blkno > 0 && blkno < mapsize);
 4132         assert(nblocks > 0 && nblocks <= mapsize);
 4133 
 4134         assert(blkno + nblocks <= mapsize);
 4135 
 4136         dbmap += (blkno / 32);
 4137         while (nblocks > 0) {
 4138                 bitno = blkno & (32 - 1);
 4139                 nb = min(nblocks, 32 - bitno);
 4140 
 4141                 mask = (0xffffffff << (32 - nb) >> bitno);
 4142                 assert((mask & *dbmap) == 0);
 4143 
 4144                 dbmap++;
 4145                 blkno += nb;
 4146                 nblocks -= nb;
 4147         }
 4148 }
 4149 
 4150 
 4151 /*
 4152  *      dbPrtMap()
 4153  */
 4154 static void dbPrtMap(struct bmap * bmp)
 4155 {
 4156         printk("   mapsize:   %d%d\n", bmp->db_mapsize);
 4157         printk("   nfree:     %d%d\n", bmp->db_nfree);
 4158         printk("   numag:     %d\n", bmp->db_numag);
 4159         printk("   agsize:    %d%d\n", bmp->db_agsize);
 4160         printk("   agl2size:  %d\n", bmp->db_agl2size);
 4161         printk("   agwidth:   %d\n", bmp->db_agwidth);
 4162         printk("   agstart:   %d\n", bmp->db_agstart);
 4163         printk("   agheigth:  %d\n", bmp->db_agheigth);
 4164         printk("   aglevel:   %d\n", bmp->db_aglevel);
 4165         printk("   maxlevel:  %d\n", bmp->db_maxlevel);
 4166         printk("   maxag:     %d\n", bmp->db_maxag);
 4167         printk("   agpref:    %d\n", bmp->db_agpref);
 4168         printk("   l2nbppg:   %d\n", bmp->db_l2nbperpage);
 4169 }
 4170 
 4171 
 4172 /*
 4173  *      dbPrtCtl()
 4174  */
 4175 static void dbPrtCtl(struct dmapctl * dcp)
 4176 {
 4177         int i, j, n;
 4178 
 4179         printk("   height:    %08x\n", le32_to_cpu(dcp->height));
 4180         printk("   leafidx:   %08x\n", le32_to_cpu(dcp->leafidx));
 4181         printk("   budmin:    %08x\n", dcp->budmin);
 4182         printk("   nleafs:    %08x\n", le32_to_cpu(dcp->nleafs));
 4183         printk("   l2nleafs:  %08x\n", le32_to_cpu(dcp->l2nleafs));
 4184 
 4185         printk("\n Tree:\n");
 4186         for (i = 0; i < CTLLEAFIND; i += 8) {
 4187                 n = min(8, CTLLEAFIND - i);
 4188 
 4189                 for (j = 0; j < n; j++)
 4190                         printf("  [%03x]: %02x", i + j,
 4191                                (char) dcp->stree[i + j]);
 4192                 printf("\n");
 4193         }
 4194 
 4195         printk("\n Tree Leaves:\n");
 4196         for (i = 0; i < LPERCTL; i += 8) {
 4197                 n = min(8, LPERCTL - i);
 4198 
 4199                 for (j = 0; j < n; j++)
 4200                         printf("  [%03x]: %02x",
 4201                                i + j,
 4202                                (char) dcp->stree[i + j + CTLLEAFIND]);
 4203                 printf("\n");
 4204         }
 4205 }
 4206 #endif                          /* _JFS_DEBUG_DMAP */

Cache object: cbef284ced20a9e2f53a5402cc80e03e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.