namei.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  *  linux/fs/namei.c
    3  *
    4  *  Copyright (C) 1991, 1992  Linus Torvalds
    5  */
    6 
    7 /*
    8  * Some corrections by tytso.
    9  */
   10 
   11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
   12  * lookup logic.
   13  */
   14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
   15  */
   16 
   17 #include <linux/init.h>
   18 #include <linux/export.h>
   19 #include <linux/kernel.h>
   20 #include <linux/slab.h>
   21 #include <linux/fs.h>
   22 #include <linux/namei.h>
   23 #include <linux/pagemap.h>
   24 #include <linux/fsnotify.h>
   25 #include <linux/personality.h>
   26 #include <linux/security.h>
   27 #include <linux/ima.h>
   28 #include <linux/syscalls.h>
   29 #include <linux/mount.h>
   30 #include <linux/audit.h>
   31 #include <linux/capability.h>
   32 #include <linux/file.h>
   33 #include <linux/fcntl.h>
   34 #include <linux/device_cgroup.h>
   35 #include <linux/fs_struct.h>
   36 #include <linux/posix_acl.h>
   37 #include <asm/uaccess.h>
   38 
   39 #include "internal.h"
   40 #include "mount.h"
   41 
   42 /* [Feb-1997 T. Schoebel-Theuer]
   43  * Fundamental changes in the pathname lookup mechanisms (namei)
   44  * were necessary because of omirr.  The reason is that omirr needs
   45  * to know the _real_ pathname, not the user-supplied one, in case
   46  * of symlinks (and also when transname replacements occur).
   47  *
   48  * The new code replaces the old recursive symlink resolution with
   49  * an iterative one (in case of non-nested symlink chains).  It does
   50  * this with calls to <fs>_follow_link().
   51  * As a side effect, dir_namei(), _namei() and follow_link() are now 
   52  * replaced with a single function lookup_dentry() that can handle all 
   53  * the special cases of the former code.
   54  *
   55  * With the new dcache, the pathname is stored at each inode, at least as
   56  * long as the refcount of the inode is positive.  As a side effect, the
   57  * size of the dcache depends on the inode cache and thus is dynamic.
   58  *
   59  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
   60  * resolution to correspond with current state of the code.
   61  *
   62  * Note that the symlink resolution is not *completely* iterative.
   63  * There is still a significant amount of tail- and mid- recursion in
   64  * the algorithm.  Also, note that <fs>_readlink() is not used in
   65  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
   66  * may return different results than <fs>_follow_link().  Many virtual
   67  * filesystems (including /proc) exhibit this behavior.
   68  */
   69 
   70 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
   71  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
   72  * and the name already exists in form of a symlink, try to create the new
   73  * name indicated by the symlink. The old code always complained that the
   74  * name already exists, due to not following the symlink even if its target
   75  * is nonexistent.  The new semantics affects also mknod() and link() when
   76  * the name is a symlink pointing to a non-existent name.
   77  *
   78  * I don't know which semantics is the right one, since I have no access
   79  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
   80  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
   81  * "old" one. Personally, I think the new semantics is much more logical.
   82  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
   83  * file does succeed in both HP-UX and SunOS, but not in Solaris
   84  * and in the old Linux semantics.
   85  */
   86 
   87 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
   88  * semantics.  See the comments in "open_namei" and "do_link" below.
   89  *
   90  * [10-Sep-98 Alan Modra] Another symlink change.
   91  */
   92 
   93 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
   94  *      inside the path - always follow.
   95  *      in the last component in creation/removal/renaming - never follow.
   96  *      if LOOKUP_FOLLOW passed - follow.
   97  *      if the pathname has trailing slashes - follow.
   98  *      otherwise - don't follow.
   99  * (applied in that order).
  100  *
  101  * [Jun 2000 AV] Inconsistent behaviour of open() in the case of flags==O_CREAT
  102  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
  103  * During the 2.4 we need to fix the userland stuff depending on it -
  104  * hopefully we will be able to get rid of that wart in 2.5. So far only
  105  * XEmacs seems to be relying on it...
  106  */
  107 /*
  108  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
  109  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
  110  * any extra contention...
  111  */
  112 
  113 /* In order to reduce some races, while at the same time doing additional
  114  * checking and hopefully speeding things up, we copy filenames to the
  115  * kernel data space before using them.
  116  *
  117  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
  118  * PATH_MAX includes the nul terminator --RR.
  119  */
  /*
   * final_putname - release a struct filename and its pathname buffer.
   * @name: the struct filename to release
   *
   * When name->separate is set, the pathname buffer (name->name) and the
   * struct are released individually: the buffer via __putname() and the
   * struct via kfree().  Otherwise a single __putname() on the struct is
   * used; getname_flags() in this file places the string in the same
   * buffer as the struct in that case.
   */
  120 void final_putname(struct filename *name)
  121 {
  122         if (name->separate) {
  123                 __putname(name->name);
  124                 kfree(name);
  125         } else {
  126                 __putname(name);
  127         }
  128 }
  129 
  /*
   * Bytes left for the pathname when struct filename sits at the start of
   * the same buffer as the string.
   * NOTE(review): this presumes the buffer handed out by __getname() is
   * PATH_MAX bytes long - confirm against the names_cache definition.
   */
  130 #define EMBEDDED_NAME_MAX       (PATH_MAX - sizeof(struct filename))
  131 
  /*
   * getname_flags - copy a pathname from user space into a struct filename.
   * @filename: user-space pointer to the pathname
   * @flags:    only LOOKUP_EMPTY is examined here; it permits an empty path
   * @empty:    if non-NULL, set to 1 when the copied pathname is empty
   *            (this function never clears it)
   *
   * If audit_reusename() yields a non-NULL result for @filename, that result
   * is returned as is and nothing is copied.
   *
   * Otherwise the copy is first attempted with the struct filename embedded
   * at the front of the __getname() buffer.  If that copy fills all
   * EMBEDDED_NAME_MAX bytes, the struct is moved into its own kzalloc()ed
   * allocation, the whole buffer is given to the string, and the copy is
   * redone with a PATH_MAX limit.
   *
   * Returns the struct filename on success, or an ERR_PTR():
   *   -ENOMEM        if __getname() or kzalloc() fails,
   *   (negative)     whatever strncpy_from_user() reported,
   *   -ENOENT        for an empty pathname without LOOKUP_EMPTY in @flags,
   *   -ENAMETOOLONG  if the copied length is >= PATH_MAX.
   * Every error exit taken after the buffer was obtained releases it (and
   * the separate struct, if any) through final_putname().
   */
  132 static struct filename *
  133 getname_flags(const char __user *filename, int flags, int *empty)
  134 {
  135         struct filename *result, *err;
  136         int len;
  137         long max;
  138         char *kname;
  139
  140         result = audit_reusename(filename);
  141         if (result)
  142                 return result;
  143
  144         result = __getname();
  145         if (unlikely(!result))
  146                 return ERR_PTR(-ENOMEM);
  147
  148         /*
  149          * First, try to embed the struct filename inside the names_cache
  150          * allocation
  151          */
  152         kname = (char *)result + sizeof(*result);
  153         result->name = kname;
  154         result->separate = false;
  155         max = EMBEDDED_NAME_MAX;
  156
  157 recopy:
  158         len = strncpy_from_user(kname, filename, max);
  159         if (unlikely(len < 0)) {
  160                 err = ERR_PTR(len);
  161                 goto error;
  162         }
  163
  164         /*
  165          * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
  166          * separate struct filename so we can dedicate the entire
  167          * names_cache allocation for the pathname, and re-do the copy from
  168          * userland.
  169          */
                  /* The max test keeps this branch from being taken on the second pass. */
  170         if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
  171                 kname = (char *)result;
  172
  173                 result = kzalloc(sizeof(*result), GFP_KERNEL);
  174                 if (!result) {
  175                         err = ERR_PTR(-ENOMEM);
                                  /*
                                   * Point result back at the names_cache buffer.  The
                                   * struct at its start still has ->separate == false,
                                   * so final_putname() takes the single-__putname()
                                   * branch.
                                   */
  176                         result = (struct filename *)kname;
  177                         goto error;
  178                 }
  179                 result->name = kname;
  180                 result->separate = true;
  181                 max = PATH_MAX;
  182                 goto recopy;
  183         }
  184
  185         /* The empty path is special. */
  186         if (unlikely(!len)) {
  187                 if (empty)
  188                         *empty = 1;
  189                 err = ERR_PTR(-ENOENT);
  190                 if (!(flags & LOOKUP_EMPTY))
  191                         goto error;
  192         }
  193
                  /*
                   * NOTE(review): presumably strncpy_from_user() returns max when
                   * no NUL was found within max bytes, so this can only trigger
                   * on the PATH_MAX pass - confirm.
                   */
  194         err = ERR_PTR(-ENAMETOOLONG);
  195         if (unlikely(len >= PATH_MAX))
  196                 goto error;
  197
  198         result->uptr = filename;
  199         audit_getname(result);
  200         return result;
  201
  202 error:
  203         final_putname(result);
  204         return err;
  205 }
  206 
  /*
   * getname - copy a user-space pathname into a kernel struct filename.
   * @filename: user-space pointer to the pathname
   *
   * Exactly getname_flags(filename, 0, NULL): LOOKUP_EMPTY is not passed, so
   * an empty pathname yields ERR_PTR(-ENOENT), and no "empty" indicator is
   * reported back.  Returns the struct filename or an ERR_PTR().
   */
  207 struct filename *
  208 getname(const char __user * filename)
  209 {
  210         return getname_flags(filename, 0, NULL);
  211 }
  212 EXPORT_SYMBOL(getname);
  213 
  /*
   * putname - release a struct filename (CONFIG_AUDITSYSCALL variant).
   * @name: the struct filename to release
   *
   * When audit_dummy_context() is false the name is handed to
   * audit_putname() instead of being released here; otherwise it is
   * released with final_putname().
   * NOTE(review): the variant used without CONFIG_AUDITSYSCALL is not
   * defined in this part of the file - presumably provided by a header.
   */
  214 #ifdef CONFIG_AUDITSYSCALL
  215 void putname(struct filename *name)
  216 {
  217         if (unlikely(!audit_dummy_context()))
  218                 return audit_putname(name);
  219         final_putname(name);
  220 }
  221 #endif
  222 
  /*
   * check_acl - evaluate @mask against the inode's access ACL, if there is one.
   * @inode: inode whose ACL_TYPE_ACCESS ACL is consulted
   * @mask:  requested access; MAY_NOT_BLOCK selects the RCU-safe path
   *
   * Returns:
   *   -EAGAIN  when no ACL applies (none cached/returned, the filesystem has
   *            no ->get_acl(), or CONFIG_FS_POSIX_ACL is off),
   *   -ECHILD  with MAY_NOT_BLOCK set when the ACL is not cached yet,
   *   an error from ->get_acl() if it returned an ERR_PTR,
   *   otherwise the result of posix_acl_permission().
   */
  223 static int check_acl(struct inode *inode, int mask)
  224 {
  225 #ifdef CONFIG_FS_POSIX_ACL
  226         struct posix_acl *acl;
  227
  228         if (mask & MAY_NOT_BLOCK) {
  229                 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
  230                 if (!acl)
  231                         return -EAGAIN;
  232                 /* no ->get_acl() calls in RCU mode... */
  233                 if (acl == ACL_NOT_CACHED)
  234                         return -ECHILD;
                          /* MAY_NOT_BLOCK is stripped before the ACL check; no release here. */
  235                 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
  236         }
  237
  238         acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
  239
  240         /*
  241          * A filesystem can force an ACL callback by just never filling the
  242          * ACL cache. But normally you'd fill the cache either at inode
  243          * instantiation time, or on the first ->get_acl call.
  244          *
  245          * If the filesystem doesn't have a get_acl() function at all, we'll
  246          * just create the negative cache entry.
  247          */
  248         if (acl == ACL_NOT_CACHED) {
  249                 if (inode->i_op->get_acl) {
  250                         acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
  251                         if (IS_ERR(acl))
  252                                 return PTR_ERR(acl);
  253                 } else {
  254                         set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
  255                         return -EAGAIN;
  256                 }
  257         }
  258
  259         if (acl) {
  260                 int error = posix_acl_permission(inode, acl, mask);
                          /* Unlike the MAY_NOT_BLOCK path, the ACL is released after use. */
  261                 posix_acl_release(acl);
  262                 return error;
  263         }
  264 #endif
  265
  266         return -EAGAIN;
  267 }
  268 
  269 /*
  270  * This does the basic permission checking
       *
       * The mode bits to test are chosen as follows:
       *  - current fsuid owns the inode: the owner bits (mode >> 6);
       *  - otherwise, if IS_POSIXACL(inode) and any group bit is set,
       *    check_acl() is consulted and any answer other than -EAGAIN is
       *    returned as is;
       *  - otherwise, if in_group_p(inode->i_gid): the group bits (mode >> 3);
       *  - otherwise the "other" bits.
       *
       * Returns 0 when every MAY_READ/MAY_WRITE/MAY_EXEC bit requested in
       * @mask is present in the chosen bits, -EACCES if not, or the value
       * passed through from check_acl().
  271  */
  272 static int acl_permission_check(struct inode *inode, int mask)
  273 {
  274         unsigned int mode = inode->i_mode;
  275
  276         if (likely(uid_eq(current_fsuid(), inode->i_uid)))
  277                 mode >>= 6;
  278         else {
  279                 if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
  280                         int error = check_acl(inode, mask);
  281                         if (error != -EAGAIN)
  282                                 return error;
  283                 }
  284
  285                 if (in_group_p(inode->i_gid))
  286                         mode >>= 3;
  287         }
  288
  289         /*
  290          * If the DACs are ok we don't need any capability check.
  291          */
  292         if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
  293                 return 0;
  294         return -EACCES;
  295 }
  296 
  297 /**
  298  * generic_permission -  check for access rights on a Posix-like filesystem
  299  * @inode:      inode to check access rights for
  300  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
  301  *
  302  * Used to check for read/write/execute permissions on a file.
  303  * We use "fsuid" for this, letting us set arbitrary permissions
  304  * for filesystem access without changing the "normal" uids which
  305  * are used for other things.
  306  *
  307  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
  308  * request cannot be satisfied (eg. requires blocking or too much complexity).
  309  * It would then be called again in ref-walk mode.
       *
       * Returns whatever acl_permission_check() returned unless that was
       * -EACCES.  In that case the capability overrides below are tried
       * (CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH via inode_capable()); the result
       * is 0 if one of them applies and -EACCES otherwise.
  310  */
  311 int generic_permission(struct inode *inode, int mask)
  312 {
  313         int ret;
  314
  315         /*
  316          * Do the basic permission checks.
  317          */
  318         ret = acl_permission_check(inode, mask);
  319         if (ret != -EACCES)
  320                 return ret;
  321
  322         if (S_ISDIR(inode->i_mode)) {
  323                 /* DACs are overridable for directories */
  324                 if (inode_capable(inode, CAP_DAC_OVERRIDE))
  325                         return 0;
                          /* Without a write request, CAP_DAC_READ_SEARCH is sufficient too. */
  326                 if (!(mask & MAY_WRITE))
  327                         if (inode_capable(inode, CAP_DAC_READ_SEARCH))
  328                                 return 0;
  329                 return -EACCES;
  330         }
  331         /*
  332          * Read/write DACs are always overridable.
  333          * Executable DACs are overridable when there is
  334          * at least one exec bit set.
  335          */
  336         if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
  337                 if (inode_capable(inode, CAP_DAC_OVERRIDE))
  338                         return 0;
  339
  340         /*
  341          * Searching includes executable on directories, else just read.
  342          */
  343         mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
  344         if (mask == MAY_READ)
  345                 if (inode_capable(inode, CAP_DAC_READ_SEARCH))
  346                         return 0;
  347
  348         return -EACCES;
  349 }
  350 
  351 /*
  352  * We _really_ want to just do "generic_permission()" without
  353  * even looking at the inode->i_op values. So we keep a cache
  354  * flag in inode->i_opflags, that says "this has no special
  355  * permission function, use the fast case".
       *
       * If IOP_FASTPERM is not set yet and the inode has an ->permission()
       * method, that method's result is returned.  If it has none, the flag is
       * set under inode->i_lock and generic_permission() is used, now and on
       * every later call.
  356  */
  357 static inline int do_inode_permission(struct inode *inode, int mask)
  358 {
  359         if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
  360                 if (likely(inode->i_op->permission))
  361                         return inode->i_op->permission(inode, mask);
  362
  363                 /* This gets set once for the inode lifetime */
  364                 spin_lock(&inode->i_lock);
  365                 inode->i_opflags |= IOP_FASTPERM;
  366                 spin_unlock(&inode->i_lock);
  367         }
  368         return generic_permission(inode, mask);
  369 }
  370 
  371 /**
  372  * __inode_permission - Check for access rights to a given inode
  373  * @inode: Inode to check permission on
  374  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  375  *
  376  * Check for read/write/execute permissions on an inode.
  377  *
  378  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
  379  *
  380  * This does not check for a read-only file system.  You probably want
  381  * inode_permission().
       *
       * The checks run in this order and the first non-zero result is returned:
       * immutable-file test (write requests only, -EACCES),
       * do_inode_permission(), devcgroup_inode_permission(), and finally
       * security_inode_permission(), whose result is returned unconditionally.
  382  */
  383 int __inode_permission(struct inode *inode, int mask)
  384 {
  385         int retval;
  386
  387         if (unlikely(mask & MAY_WRITE)) {
  388                 /*
  389                  * Nobody gets write access to an immutable file.
  390                  */
  391                 if (IS_IMMUTABLE(inode))
  392                         return -EACCES;
  393         }
  394
  395         retval = do_inode_permission(inode, mask);
  396         if (retval)
  397                 return retval;
  398
  399         retval = devcgroup_inode_permission(inode, mask);
  400         if (retval)
  401                 return retval;
  402
  403         return security_inode_permission(inode, mask);
  404 }
  405 
  406 /**
  407  * sb_permission - Check superblock-level permissions
  408  * @sb: Superblock of inode to check permission on
  409  * @inode: Inode to check permission on
  410  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  411  *
  412  * Separate out file-system wide checks from inode-specific permission checks.
       *
       * Returns -EROFS when @mask contains MAY_WRITE, @sb has MS_RDONLY set and
       * @inode is a regular file, directory or symlink; returns 0 in every
       * other case (so other inode types are not refused here).
  413  */
  414 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
  415 {
  416         if (unlikely(mask & MAY_WRITE)) {
  417                 umode_t mode = inode->i_mode;
  418
  419                 /* Nobody gets write access to a read-only fs. */
  420                 if ((sb->s_flags & MS_RDONLY) &&
  421                     (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
  422                         return -EROFS;
  423         }
  424         return 0;
  425 }
  426 
  427 /**
  428  * inode_permission - Check for access rights to a given inode
  429  * @inode: Inode to check permission on
  430  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  431  *
  432  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
  433  * this, letting us set arbitrary permissions for filesystem access without
  434  * changing the "normal" UIDs which are used for other things.
  435  *
  436  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
       *
       * Runs sb_permission() on the inode's superblock first and returns its
       * error if any; otherwise returns the result of __inode_permission().
  437  */
  438 int inode_permission(struct inode *inode, int mask)
  439 {
  440         int retval;
  441
  442         retval = sb_permission(inode->i_sb, inode, mask);
  443         if (retval)
  444                 return retval;
  445         return __inode_permission(inode, mask);
  446 }
  447 
  448 /**
  449  * path_get - get a reference to a path
  450  * @path: path to get the reference to
  451  *
  452  * Given a path increment the reference count to the dentry and the vfsmount.
       * The vfsmount reference is taken first, then the dentry reference.
  453  */
  454 void path_get(struct path *path)
  455 {
  456         mntget(path->mnt);
  457         dget(path->dentry);
  458 }
  459 EXPORT_SYMBOL(path_get);
  460 
  461 /**
  462  * path_put - put a reference to a path
  463  * @path: path to put the reference to
  464  *
  465  * Given a path decrement the reference count to the dentry and the vfsmount.
       * The dentry reference is dropped first, then the vfsmount reference,
       * i.e. the reverse of the order used by path_get().
  466  */
  467 void path_put(struct path *path)
  468 {
  469         dput(path->dentry);
  470         mntput(path->mnt);
  471 }
  472 EXPORT_SYMBOL(path_put);
  473 
  474 /*
  475  * Path walking has 2 modes, rcu-walk and ref-walk (see
  476  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
  477  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
  478  * normal reference counts on dentries and vfsmounts to transition to ref-walk
  479  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
  480  * got stuck, so ref-walk may continue from there. If this is not successful
  481  * (eg. a seqcount has changed), then failure is returned and it's up to caller
  482  * to restart the path walk from the beginning in ref-walk mode.
  483  */
  484 
  /*
   * lock_rcu_walk - enter the locking context used by rcu-walk.
   *
   * Takes the vfsmount_lock brlock for reading, then enters an RCU read-side
   * section.  unlock_rcu_walk() undoes both in the opposite order.
   */
  485 static inline void lock_rcu_walk(void)
  486 {
  487         br_read_lock(&vfsmount_lock);
  488         rcu_read_lock();
  489 }
  490 
  /*
   * unlock_rcu_walk - leave the locking context set up by lock_rcu_walk().
   *
   * Leaves the RCU read-side section first, then drops the read side of the
   * vfsmount_lock brlock (reverse of the acquisition order).
   */
  491 static inline void unlock_rcu_walk(void)
  492 {
  493         rcu_read_unlock();
  494         br_read_unlock(&vfsmount_lock);
  495 }
  496 
  497 /**
  498  * unlazy_walk - try to switch to ref-walk mode.
  499  * @nd: nameidata pathwalk data
  500  * @dentry: child of nd->path.dentry or NULL
  501  * Returns: 0 on success, -ECHILD on failure
  502  *
  503  * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
  504  * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
  505  * @nd or NULL.  Must be called from rcu-walk context.
       *
       * Lock nesting used here: fs->lock (only when nd->root has to be pinned),
       * then parent->d_lock, then dentry->d_lock.
       *
       * On failure every spinlock taken here has been released again, but
       * LOOKUP_RCU stays set and unlock_rcu_walk() is NOT called; only the
       * success path leaves rcu-walk.
  506  */
  507 static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
  508 {
  509         struct fs_struct *fs = current->fs;
  510         struct dentry *parent = nd->path.dentry;
  511         int want_root = 0;
  512
  513         BUG_ON(!(nd->flags & LOOKUP_RCU));
                  /*
                   * nd->root is compared against fs->root under fs->lock; a
                   * mismatch aborts.  The reference on it is taken further down,
                   * only after the dentry checks have succeeded.
                   */
  514         if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
  515                 want_root = 1;
  516                 spin_lock(&fs->lock);
  517                 if (nd->root.mnt != fs->root.mnt ||
  518                                 nd->root.dentry != fs->root.dentry)
  519                         goto err_root;
  520         }
  521         spin_lock(&parent->d_lock);
  522         if (!dentry) {
  523                 if (!__d_rcu_to_refcount(parent, nd->seq))
  524                         goto err_parent;
  525                 BUG_ON(nd->inode != parent->d_inode);
  526         } else {
  527                 if (dentry->d_parent != parent)
  528                         goto err_parent;
  529                 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
  530                 if (!__d_rcu_to_refcount(dentry, nd->seq))
  531                         goto err_child;
  532                 /*
  533                  * If the sequence check on the child dentry passed, then
  534                  * the child has not been removed from its parent. This
  535                  * means the parent dentry must be valid and able to take
  536                  * a reference at this point.
  537                  */
  538                 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
  539                 BUG_ON(!parent->d_count);
  540                 parent->d_count++;
  541                 spin_unlock(&dentry->d_lock);
  542         }
  543         spin_unlock(&parent->d_lock);
  544         if (want_root) {
  545                 path_get(&nd->root);
  546                 spin_unlock(&fs->lock);
  547         }
  548         mntget(nd->path.mnt);
  549
  550         unlock_rcu_walk();
  551         nd->flags &= ~LOOKUP_RCU;
  552         return 0;
  553
                  /* Unwind in reverse acquisition order; each label falls through. */
  554 err_child:
  555         spin_unlock(&dentry->d_lock);
  556 err_parent:
  557         spin_unlock(&parent->d_lock);
  558 err_root:
  559         if (want_root)
  560                 spin_unlock(&fs->lock);
  561         return -ECHILD;
  562 }
  563 
  /*
   * d_revalidate - call the dentry's ->d_revalidate() method.
   * @dentry: dentry to revalidate
   * @flags:  lookup flags passed through to the method
   *
   * Neither dentry->d_op nor the method pointer is checked for NULL here.
   * NOTE(review): callers presumably test DCACHE_OP_REVALIDATE first, as
   * complete_walk() in this file does - confirm for other callers.
   */
  564 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
  565 {
  566         return dentry->d_op->d_revalidate(dentry, flags);
  567 }
  568 
  569 /**
  570  * complete_walk - successful completion of path walk
  571  * @nd:  pointer nameidata
  572  *
  573  * If we had been in RCU mode, drop out of it and legitimize nd->path.
  574  * Revalidate the final result, unless we'd already done that during
  575  * the path walk or the filesystem doesn't ask for it.  Return 0 on
  576  * success, -error on failure.  In case of failure caller does not
  577  * need to drop nd->path.
       *
       * Failure values produced here: -ECHILD when the dentry cannot be
       * converted to a counted reference; after a LOOKUP_JUMPED walk on an
       * FS_REVAL_DOT filesystem, the negative result of ->d_revalidate(), or
       * -ESTALE when it returned 0.
  578  */
  579 static int complete_walk(struct nameidata *nd)
  580 {
  581         struct dentry *dentry = nd->path.dentry;
  582         int status;
  583
  584         if (nd->flags & LOOKUP_RCU) {
                          /*
                           * LOOKUP_RCU is cleared up front: both exits from this
                           * branch call unlock_rcu_walk().
                           */
  585                 nd->flags &= ~LOOKUP_RCU;
                          /* Forget the root pointer; no reference is dropped for it here. */
  586                 if (!(nd->flags & LOOKUP_ROOT))
  587                         nd->root.mnt = NULL;
  588                 spin_lock(&dentry->d_lock);
  589                 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
  590                         spin_unlock(&dentry->d_lock);
  591                         unlock_rcu_walk();
  592                         return -ECHILD;
  593                 }
  594                 BUG_ON(nd->inode != dentry->d_inode);
  595                 spin_unlock(&dentry->d_lock);
  596                 mntget(nd->path.mnt);
  597                 unlock_rcu_walk();
  598         }
  599
                  /* Revalidation is needed only if all three conditions below hold. */
  600         if (likely(!(nd->flags & LOOKUP_JUMPED)))
  601                 return 0;
  602
  603         if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
  604                 return 0;
  605
  606         if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
  607                 return 0;
  608
  609         /* Note: we do not d_invalidate() */
  610         status = d_revalidate(dentry, nd->flags);
  611         if (status > 0)
  612                 return 0;
  613
  614         if (!status)
  615                 status = -ESTALE;
  616
  617         path_put(&nd->path);
  618         return status;
  619 }
  620 
  /*
   * set_root - fill in nd->root if it has not been set yet (ref-walk).
   * @nd: nameidata pathwalk data
   *
   * Does nothing when nd->root.mnt is already non-NULL; otherwise obtains
   * the root of current->fs through get_fs_root().
   * NOTE(review): get_fs_root() presumably returns the path with references
   * held - confirm in its definition.
   */
  621 static __always_inline void set_root(struct nameidata *nd)
  622 {
  623         if (!nd->root.mnt)
  624                 get_fs_root(current->fs, &nd->root);
  625 }
  626
  /* Forward declaration: __vfs_follow_link() below calls this. */
  627 static int link_path_walk(const char *, struct nameidata *);
  628 
  /*
   * set_root_rcu - fill in nd->root if it has not been set yet (rcu-walk).
   * @nd: nameidata pathwalk data
   *
   * Copies current->fs->root into nd->root by plain assignment (no
   * reference is taken here) and samples the root dentry's d_seq into
   * nd->seq.  Both reads are repeated until fs->seq shows that fs->root did
   * not change in between.  Does nothing when nd->root.mnt is already set.
   */
  629 static __always_inline void set_root_rcu(struct nameidata *nd)
  630 {
  631         if (!nd->root.mnt) {
  632                 struct fs_struct *fs = current->fs;
  633                 unsigned seq;
  634
  635                 do {
  636                         seq = read_seqcount_begin(&fs->seq);
  637                         nd->root = fs->root;
  638                         nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
  639                 } while (read_seqcount_retry(&fs->seq, seq));
  640         }
  641 }
  642 
  /*
   * __vfs_follow_link - continue the walk along a symlink's target string.
   * @nd:   nameidata pathwalk data
   * @link: symlink body, or an ERR_PTR() value
   *
   * If @link is an ERR_PTR(), nd->path is dropped and the encoded error is
   * returned.  If @link starts with '/', nd->path is replaced by nd->root
   * (with a fresh reference) and LOOKUP_JUMPED is set.  In both the absolute
   * and the relative case nd->inode is refreshed from nd->path and the
   * result of link_path_walk() on @link is returned.
   */
  643 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
  644 {
  645         int ret;
  646
  647         if (IS_ERR(link))
  648                 goto fail;
  649
  650         if (*link == '/') {
  651                 set_root(nd);
                          /* Drop the old position, then take a reference for the new one. */
  652                 path_put(&nd->path);
  653                 nd->path = nd->root;
  654                 path_get(&nd->root);
  655                 nd->flags |= LOOKUP_JUMPED;
  656         }
  657         nd->inode = nd->path.dentry->d_inode;
  658
  659         ret = link_path_walk(link, nd);
  660         return ret;
  661 fail:
  662         path_put(&nd->path);
  663         return PTR_ERR(link);
  664 }
  665 
  /*
   * path_put_conditional - drop @path, sparing a mount shared with @nd.
   * @path: path to drop
   * @nd:   nameidata whose nd->path.mnt is compared against path->mnt
   *
   * The dentry reference is always dropped.  The vfsmount reference is
   * dropped only when path->mnt differs from nd->path.mnt.
   */
  666 static void path_put_conditional(struct path *path, struct nameidata *nd)
  667 {
  668         dput(path->dentry);
  669         if (path->mnt != nd->path.mnt)
  670                 mntput(path->mnt);
  671 }
  672 
  /*
   * path_to_nameidata - make @path the current position of @nd.
   * @path: new position
   * @nd:   nameidata pathwalk data
   *
   * Outside LOOKUP_RCU mode the old nd->path.dentry is dropped, and the old
   * vfsmount is dropped only if it differs from path->mnt.  In LOOKUP_RCU
   * mode nothing is dropped.  In both modes nd->path is then overwritten
   * with @path's mnt and dentry; no new reference is taken here.
   */
  673 static inline void path_to_nameidata(const struct path *path,
  674                                         struct nameidata *nd)
  675 {
  676         if (!(nd->flags & LOOKUP_RCU)) {
  677                 dput(nd->path.dentry);
  678                 if (nd->path.mnt != path->mnt)
  679                         mntput(nd->path.mnt);
  680         }
  681         nd->path.mnt = path->mnt;
  682         nd->path.dentry = path->dentry;
  683 }
  684 
  685 /*
  686  * Helper to directly jump to a known parsed path from ->follow_link,
  687  * caller must have taken a reference to path beforehand.
       *
       * The previous nd->path is dropped, *path is copied into nd->path (so the
       * caller's reference is taken over), nd->inode is refreshed and
       * LOOKUP_JUMPED is set.  The destination inode must not have a
       * ->follow_link method; this is enforced with BUG_ON().
  688  */
  689 void nd_jump_link(struct nameidata *nd, struct path *path)
  690 {
  691         path_put(&nd->path);
  692
  693         nd->path = *path;
  694         nd->inode = nd->path.dentry->d_inode;
  695         nd->flags |= LOOKUP_JUMPED;
  696
  697         BUG_ON(nd->inode->i_op->follow_link);
  698 }
  699 
  /*
   * put_link - finish with a symlink that has been followed.
   * @nd:     nameidata pathwalk data, passed through to ->put_link()
   * @link:   path of the symlink
   * @cookie: value passed back to the inode's ->put_link() method
   *
   * Calls the inode's ->put_link() method if it has one, then drops the
   * references held by @link with path_put().
   */
  700 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
  701 {
  702         struct inode *inode = link->dentry->d_inode;
  703         if (inode->i_op->put_link)
  704                 inode->i_op->put_link(link->dentry, nd, cookie);
  705         path_put(link);
  706 }
  707 
  /*
   * Switches tested by may_follow_link() and may_linkat() in this file.
   * Both are initialised to 0, in which case those checks return 0
   * (allowed) immediately.
   */
  708 int sysctl_protected_symlinks __read_mostly = 0;
  709 int sysctl_protected_hardlinks __read_mostly = 0;
  710 
  711 /**
  712  * may_follow_link - Check symlink following for unsafe situations
  713  * @link: The path of the symlink
  714  * @nd: nameidata pathwalk data
  715  *
  716  * In the case of the sysctl_protected_symlinks sysctl being enabled,
  717  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
  718  * in a sticky world-writable directory. This is to protect privileged
  719  * processes from failing races against path names that may change out
  720  * from under them by way of other users creating malicious symlinks.
  721  * It will permit symlinks to be followed only when outside a sticky
  722  * world-writable directory, or when the uid of the symlink and follower
  723  * match, or when the directory owner matches the symlink's owner.
  724  *
  725  * Returns 0 if following the symlink is allowed, -ve on error.
       *
       * On denial the event is logged with audit_log_link_denied(), @link is
       * dropped with path_put_conditional(), nd->path is dropped as well, and
       * -EACCES is returned.  Nothing is dropped when 0 is returned.
  726  */
  727 static inline int may_follow_link(struct path *link, struct nameidata *nd)
  728 {
  729         const struct inode *inode;
  730         const struct inode *parent;
  731
  732         if (!sysctl_protected_symlinks)
  733                 return 0;
  734
  735         /* Allowed if owner and follower match. */
  736         inode = link->dentry->d_inode;
  737         if (uid_eq(current_cred()->fsuid, inode->i_uid))
  738                 return 0;
  739
  740         /* Allowed if parent directory not sticky and world-writable. */
  741         parent = nd->path.dentry->d_inode;
  742         if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
  743                 return 0;
  744
  745         /* Allowed if parent directory and link owner match. */
  746         if (uid_eq(parent->i_uid, inode->i_uid))
  747                 return 0;
  748
  749         audit_log_link_denied("follow_link", link);
  750         path_put_conditional(link, nd);
  751         path_put(&nd->path);
  752         return -EACCES;
  753 }
  754 
  755 /**
  756  * safe_hardlink_source - Check for safe hardlink conditions
  757  * @inode: the source inode to hardlink from
  758  *
  759  * Return false if at least one of the following conditions:
  760  *    - inode is not a regular file
  761  *    - inode is setuid
  762  *    - inode is setgid and group-exec
  763  *    - access failure for read and write
  764  *
  765  * Otherwise returns true.
       *
       * The tests run in the order listed, so inode_permission() is only
       * consulted for regular files that passed the mode-bit tests.
  766  */
  767 static bool safe_hardlink_source(struct inode *inode)
  768 {
  769         umode_t mode = inode->i_mode;
  770
  771         /* Special files should not get pinned to the filesystem. */
  772         if (!S_ISREG(mode))
  773                 return false;
  774
  775         /* Setuid files should not get pinned to the filesystem. */
  776         if (mode & S_ISUID)
  777                 return false;
  778
  779         /* Executable setgid files should not get pinned to the filesystem. */
  780         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
  781                 return false;
  782
  783         /* Hardlinking to unreadable or unwritable sources is dangerous. */
  784         if (inode_permission(inode, MAY_READ | MAY_WRITE))
  785                 return false;
  786
  787         return true;
  788 }
  789 
  790 /**
  791  * may_linkat - Check permissions for creating a hardlink
  792  * @link: the source to hardlink from
  793  *
  794  * Block hardlink when all of:
  795  *  - sysctl_protected_hardlinks enabled
  796  *  - fsuid does not match inode
  797  *  - hardlink source is unsafe (see safe_hardlink_source() above)
  798  *  - not CAP_FOWNER
  799  *
  800  * Returns 0 if successful, -ve on error.
       *
       * A blocked attempt is logged with audit_log_link_denied() and fails with
       * -EPERM.  The three permitting tests are evaluated left to right with
       * short-circuiting, so capable(CAP_FOWNER) is queried last.
  801  */
  802 static int may_linkat(struct path *link)
  803 {
  804         const struct cred *cred;
  805         struct inode *inode;
  806
  807         if (!sysctl_protected_hardlinks)
  808                 return 0;
  809
  810         cred = current_cred();
  811         inode = link->dentry->d_inode;
  812
  813         /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
  814          * otherwise, it must be a safe source.
  815          */
  816         if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
  817             capable(CAP_FOWNER))
  818                 return 0;
  819
  820         audit_log_link_denied("linkat", link);
  821         return -EPERM;
  822 }
  823 
  /*
   * follow_link - follow one symlink during a ref-walk lookup.
   * @link: path of the symlink
   * @nd:   nameidata pathwalk data; must not be in LOOKUP_RCU mode (BUG_ON)
   * @p:    receives the cookie returned by ->follow_link(), or NULL when
   *        one of the early failure exits is taken
   *
   * Fails with -ELOOP once current->total_link_count has reached 40.
   * Otherwise it bumps that counter, calls touch_atime() on the link, asks
   * security_inode_follow_link(), and invokes the inode's ->follow_link().
   * If nd_get_link() then yields a string, it is walked with
   * __vfs_follow_link(); if that walk fails, the link is released with
   * put_link().
   *
   * Returns 0 on success or a negative errno.  On the out_put_nd_path exits
   * both nd->path and @link have been dropped and *p is NULL.
   */
  824 static __always_inline int
  825 follow_link(struct path *link, struct nameidata *nd, void **p)
  826 {
  827         struct dentry *dentry = link->dentry;
  828         int error;
  829         char *s;
  830
  831         BUG_ON(nd->flags & LOOKUP_RCU);
  832
                  /*
                   * NOTE(review): presumably @link shares nd->path's mount
                   * reference when the mounts are equal, so an extra one is
                   * taken to let path_put(link) drop it unconditionally later -
                   * confirm against the callers.
                   */
  833         if (link->mnt == nd->path.mnt)
  834                 mntget(link->mnt);
  835
  836         error = -ELOOP;
  837         if (unlikely(current->total_link_count >= 40))
  838                 goto out_put_nd_path;
  839
  840         cond_resched();
  841         current->total_link_count++;
  842
  843         touch_atime(link);
                  /* Reset the link string so that nd_get_link() below only sees
                   * what ->follow_link() stored. */
  844         nd_set_link(nd, NULL);
  845
  846         error = security_inode_follow_link(link->dentry, nd);
  847         if (error)
  848                 goto out_put_nd_path;
  849
  850         nd->last_type = LAST_BIND;
  851         *p = dentry->d_inode->i_op->follow_link(dentry, nd);
                  /* error is computed first; it is only used if *p is an ERR_PTR. */
  852         error = PTR_ERR(*p);
  853         if (IS_ERR(*p))
  854                 goto out_put_nd_path;
  855
  856         error = 0;
  857         s = nd_get_link(nd);
  858         if (s) {
  859                 error = __vfs_follow_link(nd, s);
  860                 if (unlikely(error))
  861                         put_link(nd, link, *p);
  862         }
  863
  864         return error;
  865
  866 out_put_nd_path:
  867         *p = NULL;
  868         path_put(&nd->path);
  869         path_put(link);
  870         return error;
  871 }
  872 
  873 static int follow_up_rcu(struct path *path)
  874 {
  875         struct mount *mnt = real_mount(path->mnt);
  876         struct mount *parent;
  877         struct dentry *mountpoint;
  878 
  879         parent = mnt->mnt_parent;
  880         if (&parent->mnt == path->mnt)
  881                 return 0;
  882         mountpoint = mnt->mnt_mountpoint;
  883         path->dentry = mountpoint;
  884         path->mnt = &parent->mnt;
  885         return 1;
  886 }
  887 
  888 /*
  889  * follow_up - Find the mountpoint of path's vfsmount
  890  *
  891  * Given a path, find the mountpoint of its source file system.
  892  * Replace @path with the path of the mountpoint in the parent mount.
  893  * Up is towards /.
  894  *
  895  * Return 1 if we went up a level and 0 if we were already at the
  896  * root.
  897  */
  898 int follow_up(struct path *path)
  899 {
  900         struct mount *mnt = real_mount(path->mnt);
  901         struct mount *parent;
  902         struct dentry *mountpoint;
  903 
  904         br_read_lock(&vfsmount_lock);
  905         parent = mnt->mnt_parent;
  906         if (parent == mnt) {
  907                 br_read_unlock(&vfsmount_lock);
  908                 return 0;
  909         }
  910         mntget(&parent->mnt);
  911         mountpoint = dget(mnt->mnt_mountpoint);
  912         br_read_unlock(&vfsmount_lock);
  913         dput(path->dentry);
  914         path->dentry = mountpoint;
  915         mntput(path->mnt);
  916         path->mnt = &parent->mnt;
  917         return 1;
  918 }
  919 
  920 /*
  921  * Perform an automount
  922  * - return -EISDIR to tell follow_managed() to stop and return the path we
  923  *   were called with.
  924  */
  925 static int follow_automount(struct path *path, unsigned flags,
  926                             bool *need_mntput)
  927 {
  928         struct vfsmount *mnt;
  929         int err;
  930 
  931         if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
  932                 return -EREMOTE;
  933 
  934         /* We don't want to mount if someone's just doing a stat -
  935          * unless they're stat'ing a directory and appended a '/' to
  936          * the name.
  937          *
  938          * We do, however, want to mount if someone wants to open or
  939          * create a file of any type under the mountpoint, wants to
  940          * traverse through the mountpoint or wants to open the
  941          * mounted directory.  Also, autofs may mark negative dentries
  942          * as being automount points.  These will need the attentions
  943          * of the daemon to instantiate them before they can be used.
  944          */
  945         if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
  946                      LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
  947             path->dentry->d_inode)
  948                 return -EISDIR;
  949 
  950         current->total_link_count++;
  951         if (current->total_link_count >= 40)
  952                 return -ELOOP;
  953 
  954         mnt = path->dentry->d_op->d_automount(path);
  955         if (IS_ERR(mnt)) {
  956                 /*
  957                  * The filesystem is allowed to return -EISDIR here to indicate
  958                  * it doesn't want to automount.  For instance, autofs would do
  959                  * this so that its userspace daemon can mount on this dentry.
  960                  *
  961                  * However, we can only permit this if it's a terminal point in
  962                  * the path being looked up; if it wasn't then the remainder of
  963                  * the path is inaccessible and we should say so.
  964                  */
  965                 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
  966                         return -EREMOTE;
  967                 return PTR_ERR(mnt);
  968         }
  969 
  970         if (!mnt) /* mount collision */
  971                 return 0;
  972 
  973         if (!*need_mntput) {
  974                 /* lock_mount() may release path->mnt on error */
  975                 mntget(path->mnt);
  976                 *need_mntput = true;
  977         }
  978         err = finish_automount(mnt, path);
  979 
  980         switch (err) {
  981         case -EBUSY:
  982                 /* Someone else made a mount here whilst we were busy */
  983                 return 0;
  984         case 0:
  985                 path_put(path);
  986                 path->mnt = mnt;
  987                 path->dentry = dget(mnt->mnt_root);
  988                 return 0;
  989         default:
  990                 return err;
  991         }
  992 
  993 }
  994 
/*
 * Handle a dentry that is managed in some way.
 * - Flagged for transit management (autofs)
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 *
 * @path is updated in place as mounts are crossed.
 *
 * Returns a negative errno on failure (-EISDIR is turned into success).
 * Otherwise returns need_mntput: non-zero if a mount was crossed or
 * follow_automount() took an extra mount reference, 0 if not.  The callers
 * in this file set LOOKUP_JUMPED on a non-zero result.
 */
static int follow_managed(struct path *path, unsigned flags)
{
        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
        unsigned managed;
        bool need_mntput = false;
        int ret = 0;

        /* Given that we're not holding a lock here, we retain the value in a
         * local variable for each dentry as we look at it so that we don't see
         * the components of that value change under us */
        while (managed = ACCESS_ONCE(path->dentry->d_flags),
               managed &= DCACHE_MANAGED_DENTRY,
               unlikely(managed != 0)) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(path->dentry, false);
                        if (ret < 0)
                                break;
                }

                /* Transit to a mounted filesystem. */
                if (managed & DCACHE_MOUNTED) {
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {
                                dput(path->dentry);
                                /* the caller's own mount (mnt) is never
                                 * put here: need_mntput is still false
                                 * the first time round */
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                need_mntput = true;
                                continue;
                        }

                        /* Something is mounted on this dentry in another
                         * namespace and/or whatever was mounted there in this
                         * namespace got unmounted before we managed to get the
                         * vfsmount_lock */
                }

                /* Handle an automount point */
                if (managed & DCACHE_NEED_AUTOMOUNT) {
                        ret = follow_automount(path, flags, &need_mntput);
                        if (ret < 0)
                                break;
                        continue;
                }

                /* We didn't change the current path point */
                break;
        }

        /* We are still on the caller's mount but need_mntput got set (see
         * the mntget() in follow_automount()): drop that extra reference. */
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        /* -EISDIR means "stop here and use the path as it is" */
        if (ret == -EISDIR)
                ret = 0;
        return ret < 0 ? ret : need_mntput;
}
 1065 
 1066 int follow_down_one(struct path *path)
 1067 {
 1068         struct vfsmount *mounted;
 1069 
 1070         mounted = lookup_mnt(path);
 1071         if (mounted) {
 1072                 dput(path->dentry);
 1073                 mntput(path->mnt);
 1074                 path->mnt = mounted;
 1075                 path->dentry = dget(mounted->mnt_root);
 1076                 return 1;
 1077         }
 1078         return 0;
 1079 }
 1080 
 1081 static inline bool managed_dentry_might_block(struct dentry *dentry)
 1082 {
 1083         return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
 1084                 dentry->d_op->d_manage(dentry, true) < 0);
 1085 }
 1086 
/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * @nd:    lookup state; nd->seq is resampled and LOOKUP_JUMPED is set for
 *         every mount that is crossed
 * @path:  in/out: position to start from, advanced to the topmost mount found
 * @inode: out: set to the inode of the new path->dentry whenever a mount is
 *         crossed; left untouched otherwise
 *
 * Returns true when the end of the pile was reached, false as soon as
 * managed_dentry_might_block() returns true for a dentry on the way.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                               struct inode **inode)
{
        for (;;) {
                struct mount *mounted;
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(managed_dentry_might_block(path->dentry)))
                        return false;

                if (!d_mountpoint(path->dentry))
                        break;

                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                /* flagged as a mountpoint, but no mount was found on it */
                if (!mounted)
                        break;
                path->mnt = &mounted->mnt;
                path->dentry = mounted->mnt.mnt_root;
                nd->flags |= LOOKUP_JUMPED;
                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
                /*
                 * Update the inode too. We don't need to re-check the
                 * dentry sequence number here after this d_inode read,
                 * because a mount-point is always pinned.
                 */
                *inode = path->dentry->d_inode;
        }
        return true;
}
 1122 
 1123 static void follow_mount_rcu(struct nameidata *nd)
 1124 {
 1125         while (d_mountpoint(nd->path.dentry)) {
 1126                 struct mount *mounted;
 1127                 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
 1128                 if (!mounted)
 1129                         break;
 1130                 nd->path.mnt = &mounted->mnt;
 1131                 nd->path.dentry = mounted->mnt.mnt_root;
 1132                 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 1133         }
 1134 }
 1135 
/*
 * Handle ".." in rcuwalk mode.
 *
 * Moves nd->path to the parent dentry, first going up into the parent
 * mount with follow_up_rcu() for as long as nd->path is the root of its
 * mount.  Nothing is moved if nd->path already equals nd->root.  After
 * that follow_mount_rcu() is applied and nd->inode is refreshed.
 *
 * Returns 0 on success.  If the d_seq check of the old dentry fails,
 * rcuwalk mode is left (LOOKUP_RCU cleared, unlock_rcu_walk() called) and
 * -ECHILD is returned.
 */
static int follow_dotdot_rcu(struct nameidata *nd)
{
        set_root_rcu(nd);

        while (1) {
                /* never go above the root of this walk */
                if (nd->path.dentry == nd->root.dentry &&
                    nd->path.mnt == nd->root.mnt) {
                        break;
                }
                if (nd->path.dentry != nd->path.mnt->mnt_root) {
                        struct dentry *old = nd->path.dentry;
                        struct dentry *parent = old->d_parent;
                        unsigned seq;

                        /*
                         * Sample the parent's sequence first, then make sure
                         * the child has not changed since nd->seq was taken.
                         */
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
                                goto failed;
                        nd->path.dentry = parent;
                        nd->seq = seq;
                        break;
                }
                /* at the root of a mount: try the mount below it */
                if (!follow_up_rcu(&nd->path))
                        break;
                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
        }
        follow_mount_rcu(nd);
        nd->inode = nd->path.dentry->d_inode;
        return 0;

failed:
        /* same teardown as the rcuwalk branch of terminate_walk() */
        nd->flags &= ~LOOKUP_RCU;
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        unlock_rcu_walk();
        return -ECHILD;
}
 1172 
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * @path is updated in place; the references it held are dropped for every
 * mount that is crossed.
 *
 * Returns 0 on success, or the negative value returned by ->d_manage()
 * (except -EISDIR, which is reported as 0).
 */
int follow_down(struct path *path)
{
        unsigned managed;
        int ret;

        /* d_flags is sampled once per iteration and tested from the copy */
        while (managed = ACCESS_ONCE(path->dentry->d_flags),
               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held.
                 *
                 * We indicate to the filesystem if someone is trying to mount
                 * something here.  This gives autofs the chance to deny anyone
                 * other than its daemon the right to mount on its
                 * superstructure.
                 *
                 * The filesystem may sleep at this point.
                 */
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(
                                path->dentry, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }

                /* Transit to a mounted filesystem. */
                if (managed & DCACHE_MOUNTED) {
                        struct vfsmount *mounted = lookup_mnt(path);
                        /* flagged as mounted, but nothing found: stop here */
                        if (!mounted)
                                break;
                        dput(path->dentry);
                        mntput(path->mnt);
                        path->mnt = mounted;
                        path->dentry = dget(mounted->mnt_root);
                        continue;
                }

                /* Don't handle automount points here */
                break;
        }
        return 0;
}
 1221 
 1222 /*
 1223  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
 1224  */
 1225 static void follow_mount(struct path *path)
 1226 {
 1227         while (d_mountpoint(path->dentry)) {
 1228                 struct vfsmount *mounted = lookup_mnt(path);
 1229                 if (!mounted)
 1230                         break;
 1231                 dput(path->dentry);
 1232                 mntput(path->mnt);
 1233                 path->mnt = mounted;
 1234                 path->dentry = dget(mounted->mnt_root);
 1235         }
 1236 }
 1237 
 1238 static void follow_dotdot(struct nameidata *nd)
 1239 {
 1240         set_root(nd);
 1241 
 1242         while(1) {
 1243                 struct dentry *old = nd->path.dentry;
 1244 
 1245                 if (nd->path.dentry == nd->root.dentry &&
 1246                     nd->path.mnt == nd->root.mnt) {
 1247                         break;
 1248                 }
 1249                 if (nd->path.dentry != nd->path.mnt->mnt_root) {
 1250                         /* rare case of legitimate dget_parent()... */
 1251                         nd->path.dentry = dget_parent(nd->path.dentry);
 1252                         dput(old);
 1253                         break;
 1254                 }
 1255                 if (!follow_up(&nd->path))
 1256                         break;
 1257         }
 1258         follow_mount(&nd->path);
 1259         nd->inode = nd->path.dentry->d_inode;
 1260 }
 1261 
 1262 /*
 1263  * This looks up the name in dcache, possibly revalidates the old dentry and
 1264  * allocates a new one if not found or not valid.  In the need_lookup argument
 1265  * returns whether i_op->lookup is necessary.
 1266  *
 1267  * dir->d_inode->i_mutex must be held
 1268  */
 1269 static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
 1270                                     unsigned int flags, bool *need_lookup)
 1271 {
 1272         struct dentry *dentry;
 1273         int error;
 1274 
 1275         *need_lookup = false;
 1276         dentry = d_lookup(dir, name);
 1277         if (dentry) {
 1278                 if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
 1279                         error = d_revalidate(dentry, flags);
 1280                         if (unlikely(error <= 0)) {
 1281                                 if (error < 0) {
 1282                                         dput(dentry);
 1283                                         return ERR_PTR(error);
 1284                                 } else if (!d_invalidate(dentry)) {
 1285                                         dput(dentry);
 1286                                         dentry = NULL;
 1287                                 }
 1288                         }
 1289                 }
 1290         }
 1291 
 1292         if (!dentry) {
 1293                 dentry = d_alloc(dir, name);
 1294                 if (unlikely(!dentry))
 1295                         return ERR_PTR(-ENOMEM);
 1296 
 1297                 *need_lookup = true;
 1298         }
 1299         return dentry;
 1300 }
 1301 
 1302 /*
 1303  * Call i_op->lookup on the dentry.  The dentry must be negative but may be
 1304  * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
 1305  *
 1306  * dir->d_inode->i_mutex must be held
 1307  */
 1308 static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
 1309                                   unsigned int flags)
 1310 {
 1311         struct dentry *old;
 1312 
 1313         /* Don't create child dentry for a dead directory. */
 1314         if (unlikely(IS_DEADDIR(dir))) {
 1315                 dput(dentry);
 1316                 return ERR_PTR(-ENOENT);
 1317         }
 1318 
 1319         old = dir->i_op->lookup(dir, dentry, flags);
 1320         if (unlikely(old)) {
 1321                 dput(dentry);
 1322                 dentry = old;
 1323         }
 1324         return dentry;
 1325 }
 1326 
 1327 static struct dentry *__lookup_hash(struct qstr *name,
 1328                 struct dentry *base, unsigned int flags)
 1329 {
 1330         bool need_lookup;
 1331         struct dentry *dentry;
 1332 
 1333         dentry = lookup_dcache(name, base, flags, &need_lookup);
 1334         if (!need_lookup)
 1335                 return dentry;
 1336 
 1337         return lookup_real(base->d_inode, dentry, flags);
 1338 }
 1339 
/*
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 *
 *  Looks @name up in the dcache below nd->path.dentry without taking
 *  i_mutex, in rcuwalk mode if LOOKUP_RCU is set and in refwalk mode
 *  otherwise (or after dropping out of rcuwalk mode at "unlazy").
 *
 *  Returns:
 *    0   - found; *@path and *@inode describe the result
 *    1   - not found in the dcache (or invalidated); the caller has to do
 *          the slow lookup
 *    < 0 - error; -ECHILD when rcuwalk mode could not be continued or left
 */
static int lookup_fast(struct nameidata *nd, struct qstr *name,
                       struct path *path, struct inode **inode)
{
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
        /* cleared when the rcuwalk d_revalidate() result can be reused */
        int need_reval = 1;
        /* > 0 means "valid"; stays 1 if d_revalidate() is never called */
        int status = 1;
        int err;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, we're going to
         * do the non-racy lookup, below.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
                if (!dentry)
                        goto unlazy;

                /*
                 * This sequence count validates that the inode matches
                 * the dentry name information from lookup.
                 */
                *inode = dentry->d_inode;
                if (read_seqcount_retry(&dentry->d_seq, seq))
                        return -ECHILD;

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 *
                 * The memory barrier in read_seqcount_begin of child is
                 *  enough, we can use __read_seqcount_retry here.
                 */
                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                        return -ECHILD;
                nd->seq = seq;

                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
                        status = d_revalidate(dentry, nd->flags);
                        if (unlikely(status <= 0)) {
                                /*
                                 * Anything but -ECHILD is a definite answer:
                                 * keep it in status and do not revalidate
                                 * again after leaving rcuwalk mode.
                                 */
                                if (status != -ECHILD)
                                        need_reval = 0;
                                goto unlazy;
                        }
                }
                path->mnt = mnt;
                path->dentry = dentry;
                if (unlikely(!__follow_mount_rcu(nd, path, inode)))
                        goto unlazy;
                /* automounting is only handled on the refwalk path below */
                if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
                        goto unlazy;
                return 0;
unlazy:
                /* leave rcuwalk mode; dentry may be NULL here */
                if (unlazy_walk(nd, dentry))
                        return -ECHILD;
        } else {
                dentry = __d_lookup(parent, name);
        }

        if (unlikely(!dentry))
                goto need_lookup;

        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
                status = d_revalidate(dentry, nd->flags);
        if (unlikely(status <= 0)) {
                if (status < 0) {
                        dput(dentry);
                        return status;
                }
                /* status == 0: stale dentry; drop it if it can be invalidated */
                if (!d_invalidate(dentry)) {
                        dput(dentry);
                        goto need_lookup;
                }
        }

        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
        if (unlikely(err < 0)) {
                path_put_conditional(path, nd);
                return err;
        }
        /* a positive value from follow_managed() is recorded as a jump */
        if (err)
                nd->flags |= LOOKUP_JUMPED;
        *inode = path->dentry->d_inode;
        return 0;

need_lookup:
        return 1;
}
 1437 
 1438 /* Fast lookup failed, do it the slow way */
 1439 static int lookup_slow(struct nameidata *nd, struct qstr *name,
 1440                        struct path *path)
 1441 {
 1442         struct dentry *dentry, *parent;
 1443         int err;
 1444 
 1445         parent = nd->path.dentry;
 1446         BUG_ON(nd->inode != parent->d_inode);
 1447 
 1448         mutex_lock(&parent->d_inode->i_mutex);
 1449         dentry = __lookup_hash(name, parent, nd->flags);
 1450         mutex_unlock(&parent->d_inode->i_mutex);
 1451         if (IS_ERR(dentry))
 1452                 return PTR_ERR(dentry);
 1453         path->mnt = nd->path.mnt;
 1454         path->dentry = dentry;
 1455         err = follow_managed(path, nd->flags);
 1456         if (unlikely(err < 0)) {
 1457                 path_put_conditional(path, nd);
 1458                 return err;
 1459         }
 1460         if (err)
 1461                 nd->flags |= LOOKUP_JUMPED;
 1462         return 0;
 1463 }
 1464 
 1465 static inline int may_lookup(struct nameidata *nd)
 1466 {
 1467         if (nd->flags & LOOKUP_RCU) {
 1468                 int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 1469                 if (err != -ECHILD)
 1470                         return err;
 1471                 if (unlazy_walk(nd, NULL))
 1472                         return -ECHILD;
 1473         }
 1474         return inode_permission(nd->inode, MAY_EXEC);
 1475 }
 1476 
 1477 static inline int handle_dots(struct nameidata *nd, int type)
 1478 {
 1479         if (type == LAST_DOTDOT) {
 1480                 if (nd->flags & LOOKUP_RCU) {
 1481                         if (follow_dotdot_rcu(nd))
 1482                                 return -ECHILD;
 1483                 } else
 1484                         follow_dotdot(nd);
 1485         }
 1486         return 0;
 1487 }
 1488 
 1489 static void terminate_walk(struct nameidata *nd)
 1490 {
 1491         if (!(nd->flags & LOOKUP_RCU)) {
 1492                 path_put(&nd->path);
 1493         } else {
 1494                 nd->flags &= ~LOOKUP_RCU;
 1495                 if (!(nd->flags & LOOKUP_ROOT))
 1496                         nd->root.mnt = NULL;
 1497                 unlock_rcu_walk();
 1498         }
 1499 }
 1500 
 1501 /*
 1502  * Do we need to follow links? We _really_ want to be able
 1503  * to do this check without having to look at inode->i_op,
 1504  * so we keep a cache of "no, this doesn't need follow_link"
 1505  * for the common case.
 1506  */
 1507 static inline int should_follow_link(struct inode *inode, int follow)
 1508 {
 1509         if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
 1510                 if (likely(inode->i_op->follow_link))
 1511                         return follow;
 1512 
 1513                 /* This gets set once for the inode lifetime */
 1514                 spin_lock(&inode->i_lock);
 1515                 inode->i_opflags |= IOP_NOFOLLOW;
 1516                 spin_unlock(&inode->i_lock);
 1517         }
 1518         return 0;
 1519 }
 1520 
/*
 * walk_component - advance the walk by one path component
 * @nd:     lookup state
 * @path:   out: the looked-up component (only meaningful for LAST_NORM)
 * @name:   name of the component
 * @type:   LAST_NORM for an ordinary name; anything else goes to
 *          handle_dots()
 * @follow: passed to should_follow_link()
 *
 * Returns:
 *   0   - nd now points at the component
 *   1   - the component is a symlink the caller has to follow; nd has not
 *         been advanced and the walk is in refwalk mode
 *   < 0 - error.  For LAST_NORM components terminate_walk() has been
 *         called; an error coming straight from handle_dots() is returned
 *         as is.
 */
static inline int walk_component(struct nameidata *nd, struct path *path,
                struct qstr *name, int type, int follow)
{
        struct inode *inode;
        int err;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(type != LAST_NORM))
                return handle_dots(nd, type);
        err = lookup_fast(nd, name, path, &inode);
        if (unlikely(err)) {
                if (err < 0)
                        goto out_err;

                /* err > 0: lookup_fast() asks for the slow lookup */
                err = lookup_slow(nd, name, path);
                if (err < 0)
                        goto out_err;

                /* lookup_slow() does not hand back the inode */
                inode = path->dentry->d_inode;
        }
        /* a negative dentry: the name does not exist */
        err = -ENOENT;
        if (!inode)
                goto out_path_put;

        if (should_follow_link(inode, follow)) {
                /* following a link is only done in refwalk mode */
                if (nd->flags & LOOKUP_RCU) {
                        if (unlikely(unlazy_walk(nd, path->dentry))) {
                                err = -ECHILD;
                                goto out_err;
                        }
                }
                BUG_ON(inode != path->dentry->d_inode);
                return 1;
        }
        path_to_nameidata(path, nd);
        nd->inode = inode;
        return 0;

out_path_put:
        /* move path into nd first, then terminate_walk() cleans up */
        path_to_nameidata(path, nd);
out_err:
        terminate_walk(nd);
        return err;
}
 1568 
 1569 /*
 1570  * This limits recursive symlink follows to 8, while
 1571  * limiting consecutive symlinks to 40.
 1572  *
 1573  * Without that kind of total limit, nasty chains of consecutive
 1574  * symlinks can cause almost arbitrarily long lookups.
 1575  */
 1576 static inline int nested_symlink(struct path *path, struct nameidata *nd)
 1577 {
 1578         int res;
 1579 
 1580         if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
 1581                 path_put_conditional(path, nd);
 1582                 path_put(&nd->path);
 1583                 return -ELOOP;
 1584         }
 1585         BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 1586 
 1587         nd->depth++;
 1588         current->link_count++;
 1589 
 1590         do {
 1591                 struct path link = *path;
 1592                 void *cookie;
 1593 
 1594                 res = follow_link(&link, nd, &cookie);
 1595                 if (res)
 1596                         break;
 1597                 res = walk_component(nd, path, &nd->last,
 1598                                      nd->last_type, LOOKUP_FOLLOW);
 1599                 put_link(nd, &link, cookie);
 1600         } while (res > 0);
 1601 
 1602         current->link_count--;
 1603         nd->depth--;
 1604         return res;
 1605 }
 1606 
 1607 /*
 1608  * We really don't want to look at inode->i_op->lookup
 1609  * when we don't have to. So we keep a cache bit in
 1610  * the inode ->i_opflags field that says "yes, we can
 1611  * do lookup on this inode".
 1612  */
 1613 static inline int can_lookup(struct inode *inode)
 1614 {
 1615         if (likely(inode->i_opflags & IOP_LOOKUP))
 1616                 return 1;
 1617         if (likely(!inode->i_op->lookup))
 1618                 return 0;
 1619 
 1620         /* We do this once for the lifetime of the inode */
 1621         spin_lock(&inode->i_lock);
 1622         inode->i_opflags |= IOP_LOOKUP;
 1623         spin_unlock(&inode->i_lock);
 1624         return 1;
 1625 }
 1626 
 1627 /*
 1628  * We can do the critical dentry name comparison and hashing
 1629  * operations one word at a time, but we are limited to:
 1630  *
 1631  * - Architectures with fast unaligned word accesses. We could
 1632  *   do a "get_unaligned()" if this helps and is sufficiently
 1633  *   fast.
 1634  *
 1635  * - Little-endian machines (so that we can generate the mask
 1636  *   of low bytes efficiently). Again, we *could* do a byte
 1637  *   swapping load on big-endian architectures if that is not
 1638  *   expensive enough to make the optimization worthless.
 1639  *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 1648  */
 1649 #ifdef CONFIG_DCACHE_WORD_ACCESS
 1650 
 1651 #include <asm/word-at-a-time.h>
 1652 
 1653 #ifdef CONFIG_64BIT
 1654 
 1655 static inline unsigned int fold_hash(unsigned long hash)
 1656 {
 1657         hash += hash >> (8*sizeof(int));
 1658         return hash;
 1659 }
 1660 
 1661 #else   /* 32-bit case */
 1662 
 1663 #define fold_hash(x) (x)
 1664 
 1665 #endif
 1666 
 1667 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 1668 {
 1669         unsigned long a, mask;
 1670         unsigned long hash = 0;
 1671 
 1672         for (;;) {
 1673                 a = load_unaligned_zeropad(name);
 1674                 if (len < sizeof(unsigned long))
 1675                         break;
 1676                 hash += a;
 1677                 hash *= 9;
 1678                 name += sizeof(unsigned long);
 1679                 len -= sizeof(unsigned long);
 1680                 if (!len)
 1681                         goto done;
 1682         }
 1683         mask = ~(~0ul << len*8);
 1684         hash += mask & a;
 1685 done:
 1686         return fold_hash(hash);
 1687 }
 1688 EXPORT_SYMBOL(full_name_hash);
 1689 
 1690 /*
 1691  * Calculate the length and hash of the path component, and
 1692  * return the length of the component;
 1693  */
 1694 static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 1695 {
 1696         unsigned long a, b, adata, bdata, mask, hash, len;
 1697         const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 1698 
 1699         hash = a = 0;
 1700         len = -sizeof(unsigned long);
 1701         do {
 1702                 hash = (hash + a) * 9;
 1703                 len += sizeof(unsigned long);
 1704                 a = load_unaligned_zeropad(name+len);
 1705                 b = a ^ REPEAT_BYTE('/');
 1706         } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
 1707 
 1708         adata = prep_zero_mask(a, adata, &constants);
 1709         bdata = prep_zero_mask(b, bdata, &constants);
 1710 
 1711         mask = create_zero_mask(adata | bdata);
 1712 
 1713         hash += a & zero_bytemask(mask);
 1714         *hashp = fold_hash(hash);
 1715 
 1716         return len + find_zero(mask);
 1717 }
 1718 
 1719 #else
 1720 
 1721 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 1722 {
 1723         unsigned long hash = init_name_hash();
 1724         while (len--)
 1725                 hash = partial_name_hash(*name++, hash);
 1726         return end_name_hash(hash);
 1727 }
 1728 EXPORT_SYMBOL(full_name_hash);
 1729 
 1730 /*
 1731  * We know there's a real path component here of at least
 1732  * one character.
 1733  */
 1734 static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 1735 {
 1736         unsigned long hash = init_name_hash();
 1737         unsigned long len = 0, c;
 1738 
 1739         c = (unsigned char)*name;
 1740         do {
 1741                 len++;
 1742                 hash = partial_name_hash(c, hash);
 1743                 c = (unsigned char)name[len];
 1744         } while (c && c != '/');
 1745         *hashp = end_name_hash(hash);
 1746         return len;
 1747 }
 1748 
 1749 #endif
 1750 
 1751 /*
 1752  * Name resolution.
 1753  * This is the basic name resolution function, turning a pathname into
 1754  * the final dentry. We expect 'base' to be positive and a directory.
 1755  *
 1756  * Returns 0 and nd will have valid dentry and mnt on success.
 1757  * Returns error and drops reference to input namei data on failure.
 1758  */
 1759 static int link_path_walk(const char *name, struct nameidata *nd)
 1760 {
 1761         struct path next;
 1762         int err;
 1763         
 1764         while (*name=='/')
 1765                 name++;
 1766         if (!*name)
 1767                 return 0;
 1768 
 1769         /* At this point we know we have a real path component. */
 1770         for(;;) {
 1771                 struct qstr this;
 1772                 long len;
 1773                 int type;
 1774 
 1775                 err = may_lookup(nd);
 1776                 if (err)
 1777                         break;
 1778 
 1779                 len = hash_name(name, &this.hash);
 1780                 this.name = name;
 1781                 this.len = len;
 1782 
 1783                 type = LAST_NORM;
 1784                 if (name[0] == '.') switch (len) {
 1785                         case 2:
 1786                                 if (name[1] == '.') {
 1787                                         type = LAST_DOTDOT;
 1788                                         nd->flags |= LOOKUP_JUMPED;
 1789                                 }
 1790                                 break;
 1791                         case 1:
 1792                                 type = LAST_DOT;
 1793                 }
 1794                 if (likely(type == LAST_NORM)) {
 1795                         struct dentry *parent = nd->path.dentry;
 1796                         nd->flags &= ~LOOKUP_JUMPED;
 1797                         if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
 1798                                 err = parent->d_op->d_hash(parent, nd->inode,
 1799                                                            &this);
 1800                                 if (err < 0)
 1801                                         break;
 1802                         }
 1803                 }
 1804 
 1805                 if (!name[len])
 1806                         goto last_component;
 1807                 /*
 1808                  * If it wasn't NUL, we know it was '/'. Skip that
 1809                  * slash, and continue until no more slashes.
 1810                  */
 1811                 do {
 1812                         len++;
 1813                 } while (unlikely(name[len] == '/'));
 1814                 if (!name[len])
 1815                         goto last_component;
 1816                 name += len;
 1817 
 1818                 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 1819                 if (err < 0)
 1820                         return err;
 1821 
 1822                 if (err) {
 1823                         err = nested_symlink(&next, nd);
 1824                         if (err)
 1825                                 return err;
 1826                 }
 1827                 if (can_lookup(nd->inode))
 1828                         continue;
 1829                 err = -ENOTDIR; 
 1830                 break;
 1831                 /* here ends the main loop */
 1832 
 1833 last_component:
 1834                 nd->last = this;
 1835                 nd->last_type = type;
 1836                 return 0;
 1837         }
 1838         terminate_walk(nd);
 1839         return err;
 1840 }
 1841 
 1842 static int path_init(int dfd, const char *name, unsigned int flags,
 1843                      struct nameidata *nd, struct file **fp)
 1844 {
 1845         int retval = 0;
 1846 
 1847         nd->last_type = LAST_ROOT; /* if there are only slashes... */
 1848         nd->flags = flags | LOOKUP_JUMPED;
 1849         nd->depth = 0;
 1850         if (flags & LOOKUP_ROOT) {
 1851                 struct inode *inode = nd->root.dentry->d_inode;
 1852                 if (*name) {
 1853                         if (!can_lookup(inode))
 1854                                 return -ENOTDIR;
 1855                         retval = inode_permission(inode, MAY_EXEC);
 1856                         if (retval)
 1857                                 return retval;
 1858                 }
 1859                 nd->path = nd->root;
 1860                 nd->inode = inode;
 1861                 if (flags & LOOKUP_RCU) {
 1862                         lock_rcu_walk();
 1863                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 1864                 } else {
 1865                         path_get(&nd->path);
 1866                 }
 1867                 return 0;
 1868         }
 1869 
 1870         nd->root.mnt = NULL;
 1871 
 1872         if (*name=='/') {
 1873                 if (flags & LOOKUP_RCU) {
 1874                         lock_rcu_walk();
 1875                         set_root_rcu(nd);
 1876                 } else {
 1877                         set_root(nd);
 1878                         path_get(&nd->root);
 1879                 }
 1880                 nd->path = nd->root;
 1881         } else if (dfd == AT_FDCWD) {
 1882                 if (flags & LOOKUP_RCU) {
 1883                         struct fs_struct *fs = current->fs;
 1884                         unsigned seq;
 1885 
 1886                         lock_rcu_walk();
 1887 
 1888                         do {
 1889                                 seq = read_seqcount_begin(&fs->seq);
 1890                                 nd->path = fs->pwd;
 1891                                 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 1892                         } while (read_seqcount_retry(&fs->seq, seq));
 1893                 } else {
 1894                         get_fs_pwd(current->fs, &nd->path);
 1895                 }
 1896         } else {
 1897                 /* Caller must check execute permissions on the starting path component */
 1898                 struct fd f = fdget_raw(dfd);
 1899                 struct dentry *dentry;
 1900 
 1901                 if (!f.file)
 1902                         return -EBADF;
 1903 
 1904                 dentry = f.file->f_path.dentry;
 1905 
 1906                 if (*name) {
 1907                         if (!can_lookup(dentry->d_inode)) {
 1908                                 fdput(f);
 1909                                 return -ENOTDIR;
 1910                         }
 1911                 }
 1912 
 1913                 nd->path = f.file->f_path;
 1914                 if (flags & LOOKUP_RCU) {
 1915                         if (f.need_put)
 1916                                 *fp = f.file;
 1917                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 1918                         lock_rcu_walk();
 1919                 } else {
 1920                         path_get(&nd->path);
 1921                         fdput(f);
 1922                 }
 1923         }
 1924 
 1925         nd->inode = nd->path.dentry->d_inode;
 1926         return 0;
 1927 }
 1928 
 1929 static inline int lookup_last(struct nameidata *nd, struct path *path)
 1930 {
 1931         if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 1932                 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 1933 
 1934         nd->flags &= ~LOOKUP_PARENT;
 1935         return walk_component(nd, path, &nd->last, nd->last_type,
 1936                                         nd->flags & LOOKUP_FOLLOW);
 1937 }
 1938 
 1939 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 1940 static int path_lookupat(int dfd, const char *name,
 1941                                 unsigned int flags, struct nameidata *nd)
 1942 {
 1943         struct file *base = NULL;
 1944         struct path path;
 1945         int err;
 1946 
 1947         /*
 1948          * Path walking is largely split up into 2 different synchronisation
 1949          * schemes, rcu-walk and ref-walk (explained in
 1950          * Documentation/filesystems/path-lookup.txt). These share much of the
 1951          * path walk code, but some things particularly setup, cleanup, and
 1952          * following mounts are sufficiently divergent that functions are
 1953          * duplicated. Typically there is a function foo(), and its RCU
 1954          * analogue, foo_rcu().
 1955          *
 1956          * -ECHILD is the error number of choice (just to avoid clashes) that
 1957          * is returned if some aspect of an rcu-walk fails. Such an error must
 1958          * be handled by restarting a traditional ref-walk (which will always
 1959          * be able to complete).
 1960          */
 1961         err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
 1962 
 1963         if (unlikely(err))
 1964                 return err;
 1965 
 1966         current->total_link_count = 0;
 1967         err = link_path_walk(name, nd);
 1968 
 1969         if (!err && !(flags & LOOKUP_PARENT)) {
 1970                 err = lookup_last(nd, &path);
 1971                 while (err > 0) {
 1972                         void *cookie;
 1973                         struct path link = path;
 1974                         err = may_follow_link(&link, nd);
 1975                         if (unlikely(err))
 1976                                 break;
 1977                         nd->flags |= LOOKUP_PARENT;
 1978                         err = follow_link(&link, nd, &cookie);
 1979                         if (err)
 1980                                 break;
 1981                         err = lookup_last(nd, &path);
 1982                         put_link(nd, &link, cookie);
 1983                 }
 1984         }
 1985 
 1986         if (!err)
 1987                 err = complete_walk(nd);
 1988 
 1989         if (!err && nd->flags & LOOKUP_DIRECTORY) {
 1990                 if (!nd->inode->i_op->lookup) {
 1991                         path_put(&nd->path);
 1992                         err = -ENOTDIR;
 1993                 }
 1994         }
 1995 
 1996         if (base)
 1997                 fput(base);
 1998 
 1999         if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 2000                 path_put(&nd->root);
 2001                 nd->root.mnt = NULL;
 2002         }
 2003         return err;
 2004 }
 2005 
 2006 static int filename_lookup(int dfd, struct filename *name,
 2007                                 unsigned int flags, struct nameidata *nd)
 2008 {
 2009         int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
 2010         if (unlikely(retval == -ECHILD))
 2011                 retval = path_lookupat(dfd, name->name, flags, nd);
 2012         if (unlikely(retval == -ESTALE))
 2013                 retval = path_lookupat(dfd, name->name,
 2014                                                 flags | LOOKUP_REVAL, nd);
 2015 
 2016         if (likely(!retval))
 2017                 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
 2018         return retval;
 2019 }
 2020 
 2021 static int do_path_lookup(int dfd, const char *name,
 2022                                 unsigned int flags, struct nameidata *nd)
 2023 {
 2024         struct filename filename = { .name = name };
 2025 
 2026         return filename_lookup(dfd, &filename, flags, nd);
 2027 }
 2028 
 2029 /* does lookup, returns the object with parent locked */
 2030 struct dentry *kern_path_locked(const char *name, struct path *path)
 2031 {
 2032         struct nameidata nd;
 2033         struct dentry *d;
 2034         int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
 2035         if (err)
 2036                 return ERR_PTR(err);
 2037         if (nd.last_type != LAST_NORM) {
 2038                 path_put(&nd.path);
 2039                 return ERR_PTR(-EINVAL);
 2040         }
 2041         mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 2042         d = __lookup_hash(&nd.last, nd.path.dentry, 0);
 2043         if (IS_ERR(d)) {
 2044                 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 2045                 path_put(&nd.path);
 2046                 return d;
 2047         }
 2048         *path = nd.path;
 2049         return d;
 2050 }
 2051 
 2052 int kern_path(const char *name, unsigned int flags, struct path *path)
 2053 {
 2054         struct nameidata nd;
 2055         int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
 2056         if (!res)
 2057                 *path = nd.path;
 2058         return res;
 2059 }
 2060 
 2061 /**
 2062  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 2063  * @dentry:  pointer to dentry of the base directory
 2064  * @mnt: pointer to vfs mount of the base directory
 2065  * @name: pointer to file name
 2066  * @flags: lookup flags
 2067  * @path: pointer to struct path to fill
 2068  */
 2069 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 2070                     const char *name, unsigned int flags,
 2071                     struct path *path)
 2072 {
 2073         struct nameidata nd;
 2074         int err;
 2075         nd.root.dentry = dentry;
 2076         nd.root.mnt = mnt;
 2077         BUG_ON(flags & LOOKUP_PARENT);
 2078         /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
 2079         err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
 2080         if (!err)
 2081                 *path = nd.path;
 2082         return err;
 2083 }
 2084 
 2085 /*
 2086  * Restricted form of lookup. Doesn't follow links, single-component only,
 2087  * needs parent already locked. Doesn't follow mounts.
 2088  * SMP-safe.
 2089  */
 2090 static struct dentry *lookup_hash(struct nameidata *nd)
 2091 {
 2092         return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
 2093 }
 2094 
 2095 /**
 2096  * lookup_one_len - filesystem helper to lookup single pathname component
 2097  * @name:       pathname component to lookup
 2098  * @base:       base directory to lookup from
 2099  * @len:        maximum length @len should be interpreted to
 2100  *
 2101  * Note that this routine is purely a helper for filesystem usage and should
 2102  * not be called by generic code.  Also note that by using this function the
 2103  * nameidata argument is passed to the filesystem methods and a filesystem
 2104  * using this helper needs to be prepared for that.
 2105  */
 2106 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 2107 {
 2108         struct qstr this;
 2109         unsigned int c;
 2110         int err;
 2111 
 2112         WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
 2113 
 2114         this.name = name;
 2115         this.len = len;
 2116         this.hash = full_name_hash(name, len);
 2117         if (!len)
 2118                 return ERR_PTR(-EACCES);
 2119 
 2120         if (unlikely(name[0] == '.')) {
 2121                 if (len < 2 || (len == 2 && name[1] == '.'))
 2122                         return ERR_PTR(-EACCES);
 2123         }
 2124 
 2125         while (len--) {
 2126                 c = *(const unsigned char *)name++;
 2127                 if (c == '/' || c == '\0')
 2128                         return ERR_PTR(-EACCES);
 2129         }
 2130         /*
 2131          * See if the low-level filesystem might want
 2132          * to use its own hash..
 2133          */
 2134         if (base->d_flags & DCACHE_OP_HASH) {
 2135                 int err = base->d_op->d_hash(base, base->d_inode, &this);
 2136                 if (err < 0)
 2137                         return ERR_PTR(err);
 2138         }
 2139 
 2140         err = inode_permission(base->d_inode, MAY_EXEC);
 2141         if (err)
 2142                 return ERR_PTR(err);
 2143 
 2144         return __lookup_hash(&this, base, 0);
 2145 }
 2146 
 2147 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 2148                  struct path *path, int *empty)
 2149 {
 2150         struct nameidata nd;
 2151         struct filename *tmp = getname_flags(name, flags, empty);
 2152         int err = PTR_ERR(tmp);
 2153         if (!IS_ERR(tmp)) {
 2154 
 2155                 BUG_ON(flags & LOOKUP_PARENT);
 2156 
 2157                 err = filename_lookup(dfd, tmp, flags, &nd);
 2158                 putname(tmp);
 2159                 if (!err)
 2160                         *path = nd.path;
 2161         }
 2162         return err;
 2163 }
 2164 
 2165 int user_path_at(int dfd, const char __user *name, unsigned flags,
 2166                  struct path *path)
 2167 {
 2168         return user_path_at_empty(dfd, name, flags, path, NULL);
 2169 }
 2170 
 2171 /*
 2172  * NB: most callers don't do anything directly with the reference to the
 2173  *     to struct filename, but the nd->last pointer points into the name string
 2174  *     allocated by getname. So we must hold the reference to it until all
 2175  *     path-walking is complete.
 2176  */
 2177 static struct filename *
 2178 user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
 2179                  unsigned int flags)
 2180 {
 2181         struct filename *s = getname(path);
 2182         int error;
 2183 
 2184         /* only LOOKUP_REVAL is allowed in extra flags */
 2185         flags &= LOOKUP_REVAL;
 2186 
 2187         if (IS_ERR(s))
 2188                 return s;
 2189 
 2190         error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
 2191         if (error) {
 2192                 putname(s);
 2193                 return ERR_PTR(error);
 2194         }
 2195 
 2196         return s;
 2197 }
 2198 
 2199 /*
 2200  * It's inline, so penalty for filesystems that don't use sticky bit is
 2201  * minimal.
 2202  */
 2203 static inline int check_sticky(struct inode *dir, struct inode *inode)
 2204 {
 2205         kuid_t fsuid = current_fsuid();
 2206 
 2207         if (!(dir->i_mode & S_ISVTX))
 2208                 return 0;
 2209         if (uid_eq(inode->i_uid, fsuid))
 2210                 return 0;
 2211         if (uid_eq(dir->i_uid, fsuid))
 2212                 return 0;
 2213         return !inode_capable(inode, CAP_FOWNER);
 2214 }
 2215 
 2216 /*
 2217  *      Check whether we can remove a link victim from directory dir, check
 2218  *  whether the type of victim is right.
 2219  *  1. We can't do it if dir is read-only (done in permission())
 2220  *  2. We should have write and exec permissions on dir
 2221  *  3. We can't remove anything from append-only dir
 2222  *  4. We can't do anything with immutable dir (done in permission())
 2223  *  5. If the sticky bit on dir is set we should either
 2224  *      a. be owner of dir, or
 2225  *      b. be owner of victim, or
 2226  *      c. have CAP_FOWNER capability
 2227  *  6. If the victim is append-only or immutable we can't do antyhing with
 2228  *     links pointing to it.
 2229  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 2230  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 2231  *  9. We can't remove a root or mountpoint.
 2232  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 2233  *     nfs_async_unlink().
 2234  */
 2235 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 2236 {
 2237         int error;
 2238 
 2239         if (!victim->d_inode)
 2240                 return -ENOENT;
 2241 
 2242         BUG_ON(victim->d_parent->d_inode != dir);
 2243         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 2244 
 2245         error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 2246         if (error)
 2247                 return error;
 2248         if (IS_APPEND(dir))
 2249                 return -EPERM;
 2250         if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
 2251             IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 2252                 return -EPERM;
 2253         if (isdir) {
 2254                 if (!S_ISDIR(victim->d_inode->i_mode))
 2255                         return -ENOTDIR;
 2256                 if (IS_ROOT(victim))
 2257                         return -EBUSY;
 2258         } else if (S_ISDIR(victim->d_inode->i_mode))
 2259                 return -EISDIR;
 2260         if (IS_DEADDIR(dir))
 2261                 return -ENOENT;
 2262         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
 2263                 return -EBUSY;
 2264         return 0;
 2265 }
 2266 
 2267 /*      Check whether we can create an object with dentry child in directory
 2268  *  dir.
 2269  *  1. We can't do it if child already exists (open has special treatment for
 2270  *     this case, but since we are inlined it's OK)
 2271  *  2. We can't do it if dir is read-only (done in permission())
 2272  *  3. We should have write and exec permissions on dir
 2273  *  4. We can't do it if dir is immutable (done in permission())
 2274  */
 2275 static inline int may_create(struct inode *dir, struct dentry *child)
 2276 {
 2277         if (child->d_inode)
 2278                 return -EEXIST;
 2279         if (IS_DEADDIR(dir))
 2280                 return -ENOENT;
 2281         return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 2282 }
 2283 
 2284 /*
 2285  * p1 and p2 should be directories on the same fs.
 2286  */
 2287 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 2288 {
 2289         struct dentry *p;
 2290 
 2291         if (p1 == p2) {
 2292                 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
 2293                 return NULL;
 2294         }
 2295 
 2296         mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 2297 
 2298         p = d_ancestor(p2, p1);
 2299         if (p) {
 2300                 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
 2301                 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
 2302                 return p;
 2303         }
 2304 
 2305         p = d_ancestor(p1, p2);
 2306         if (p) {
 2307                 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
 2308                 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
 2309                 return p;
 2310         }
 2311 
 2312         mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
 2313         mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
 2314         return NULL;
 2315 }
 2316 
 2317 void unlock_rename(struct dentry *p1, struct dentry *p2)
 2318 {
 2319         mutex_unlock(&p1->d_inode->i_mutex);
 2320         if (p1 != p2) {
 2321                 mutex_unlock(&p2->d_inode->i_mutex);
 2322                 mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 2323         }
 2324 }
 2325 
 2326 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 2327                 bool want_excl)
 2328 {
 2329         int error = may_create(dir, dentry);
 2330         if (error)
 2331                 return error;
 2332 
 2333         if (!dir->i_op->create)
 2334                 return -EACCES; /* shouldn't it be ENOSYS? */
 2335         mode &= S_IALLUGO;
 2336         mode |= S_IFREG;
 2337         error = security_inode_create(dir, dentry, mode);
 2338         if (error)
 2339                 return error;
 2340         error = dir->i_op->create(dir, dentry, mode, want_excl);
 2341         if (!error)
 2342                 fsnotify_create(dir, dentry);
 2343         return error;
 2344 }
 2345 
 2346 static int may_open(struct path *path, int acc_mode, int flag)
 2347 {
 2348         struct dentry *dentry = path->dentry;
 2349         struct inode *inode = dentry->d_inode;
 2350         int error;
 2351 
 2352         /* O_PATH? */
 2353         if (!acc_mode)
 2354                 return 0;
 2355 
 2356         if (!inode)
 2357                 return -ENOENT;
 2358 
 2359         switch (inode->i_mode & S_IFMT) {
 2360         case S_IFLNK:
 2361                 return -ELOOP;
 2362         case S_IFDIR:
 2363                 if (acc_mode & MAY_WRITE)
 2364                         return -EISDIR;
 2365                 break;
 2366         case S_IFBLK:
 2367         case S_IFCHR:
 2368                 if (path->mnt->mnt_flags & MNT_NODEV)
 2369                         return -EACCES;
 2370                 /*FALLTHRU*/
 2371         case S_IFIFO:
 2372         case S_IFSOCK:
 2373                 flag &= ~O_TRUNC;
 2374                 break;
 2375         }
 2376 
 2377         error = inode_permission(inode, acc_mode);
 2378         if (error)
 2379                 return error;
 2380 
 2381         /*
 2382          * An append-only file must be opened in append mode for writing.
 2383          */
 2384         if (IS_APPEND(inode)) {
 2385                 if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
 2386                         return -EPERM;
 2387                 if (flag & O_TRUNC)
 2388                         return -EPERM;
 2389         }
 2390 
 2391         /* O_NOATIME can only be set by the owner or superuser */
 2392         if (flag & O_NOATIME && !inode_owner_or_capable(inode))
 2393                 return -EPERM;
 2394 
 2395         return 0;
 2396 }
 2397 
 2398 static int handle_truncate(struct file *filp)
 2399 {
 2400         struct path *path = &filp->f_path;
 2401         struct inode *inode = path->dentry->d_inode;
 2402         int error = get_write_access(inode);
 2403         if (error)
 2404                 return error;
 2405         /*
 2406          * Refuse to truncate files with mandatory locks held on them.
 2407          */
 2408         error = locks_verify_locked(inode);
 2409         if (!error)
 2410                 error = security_path_truncate(path);
 2411         if (!error) {
 2412                 error = do_truncate(path->dentry, 0,
 2413                                     ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 2414                                     filp);
 2415         }
 2416         put_write_access(inode);
 2417         return error;
 2418 }
 2419 
 2420 static inline int open_to_namei_flags(int flag)
 2421 {
 2422         if ((flag & O_ACCMODE) == 3)
 2423                 flag--;
 2424         return flag;
 2425 }
 2426 
 2427 static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
 2428 {
 2429         int error = security_path_mknod(dir, dentry, mode, 0);
 2430         if (error)
 2431                 return error;
 2432 
 2433         error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
 2434         if (error)
 2435                 return error;
 2436 
 2437         return security_inode_create(dir->dentry->d_inode, dentry, mode);
 2438 }
 2439 
 2440 /*
 2441  * Attempt to atomically look up, create and open a file from a negative
 2442  * dentry.
 2443  *
 2444  * Returns 0 if successful.  The file will have been created and attached to
 2445  * @file by the filesystem calling finish_open().
 2446  *
 2447  * Returns 1 if the file was looked up only or didn't need creating.  The
 2448  * caller will need to perform the open themselves.  @path will have been
 2449  * updated to point to the new dentry.  This may be negative.
 2450  *
 2451  * Returns an error code otherwise.
 2452  */
 2453 static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 2454                         struct path *path, struct file *file,
 2455                         const struct open_flags *op,
 2456                         bool got_write, bool need_lookup,
 2457                         int *opened)
 2458 {
 2459         struct inode *dir =  nd->path.dentry->d_inode;
 2460         unsigned open_flag = open_to_namei_flags(op->open_flag);
 2461         umode_t mode;
 2462         int error;
 2463         int acc_mode;
 2464         int create_error = 0;
 2465         struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
 2466 
 2467         BUG_ON(dentry->d_inode);
 2468 
 2469         /* Don't create child dentry for a dead directory. */
 2470         if (unlikely(IS_DEADDIR(dir))) {
 2471                 error = -ENOENT;
 2472                 goto out;
 2473         }
 2474 
 2475         mode = op->mode;
 2476         if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
 2477                 mode &= ~current_umask();
 2478 
 2479         if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
 2480                 open_flag &= ~O_TRUNC;
 2481                 *opened |= FILE_CREATED;
 2482         }
 2483 
 2484         /*
 2485          * Checking write permission is tricky, bacuse we don't know if we are
 2486          * going to actually need it: O_CREAT opens should work as long as the
 2487          * file exists.  But checking existence breaks atomicity.  The trick is
 2488          * to check access and if not granted clear O_CREAT from the flags.
 2489          *
 2490          * Another problem is returing the "right" error value (e.g. for an
 2491          * O_EXCL open we want to return EEXIST not EROFS).
 2492          */
 2493         if (((open_flag & (O_CREAT | O_TRUNC)) ||
 2494             (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
 2495                 if (!(open_flag & O_CREAT)) {
 2496                         /*
 2497                          * No O_CREATE -> atomicity not a requirement -> fall
 2498                          * back to lookup + open
 2499                          */
 2500                         goto no_open;
 2501                 } else if (open_flag & (O_EXCL | O_TRUNC)) {
 2502                         /* Fall back and fail with the right error */
 2503                         create_error = -EROFS;
 2504                         goto no_open;
 2505                 } else {
 2506                         /* No side effects, safe to clear O_CREAT */
 2507                         create_error = -EROFS;
 2508                         open_flag &= ~O_CREAT;
 2509                 }
 2510         }
 2511 
 2512         if (open_flag & O_CREAT) {
 2513                 error = may_o_create(&nd->path, dentry, mode);
 2514                 if (error) {
 2515                         create_error = error;
 2516                         if (open_flag & O_EXCL)
 2517                                 goto no_open;
 2518                         open_flag &= ~O_CREAT;
 2519                 }
 2520         }
 2521 
 2522         if (nd->flags & LOOKUP_DIRECTORY)
 2523                 open_flag |= O_DIRECTORY;
 2524 
 2525         file->f_path.dentry = DENTRY_NOT_SET;
 2526         file->f_path.mnt = nd->path.mnt;
 2527         error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
 2528                                       opened);
 2529         if (error < 0) {
 2530                 if (create_error && error == -ENOENT)
 2531                         error = create_error;
 2532                 goto out;
 2533         }
 2534 
 2535         acc_mode = op->acc_mode;
 2536         if (*opened & FILE_CREATED) {
 2537                 fsnotify_create(dir, dentry);
 2538                 acc_mode = MAY_OPEN;
 2539         }
 2540 
 2541         if (error) {    /* returned 1, that is */
 2542                 if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
 2543                         error = -EIO;
 2544                         goto out;
 2545                 }
 2546                 if (file->f_path.dentry) {
 2547                         dput(dentry);
 2548                         dentry = file->f_path.dentry;
 2549                 }
 2550                 if (create_error && dentry->d_inode == NULL) {
 2551                         error = create_error;
 2552                         goto out;
 2553                 }
 2554                 goto looked_up;
 2555         }
 2556 
 2557         /*
 2558          * We didn't have the inode before the open, so check open permission
 2559          * here.
 2560          */
 2561         error = may_open(&file->f_path, acc_mode, open_flag);
 2562         if (error)
 2563                 fput(file);
 2564 
 2565 out:
 2566         dput(dentry);
 2567         return error;
 2568 
 2569 no_open:
 2570         if (need_lookup) {
 2571                 dentry = lookup_real(dir, dentry, nd->flags);
 2572                 if (IS_ERR(dentry))
 2573                         return PTR_ERR(dentry);
 2574 
 2575                 if (create_error) {
 2576                         int open_flag = op->open_flag;
 2577 
 2578                         error = create_error;
 2579                         if ((open_flag & O_EXCL)) {
 2580                                 if (!dentry->d_inode)
 2581                                         goto out;
 2582                         } else if (!dentry->d_inode) {
 2583                                 goto out;
 2584                         } else if ((open_flag & O_TRUNC) &&
 2585                                    S_ISREG(dentry->d_inode->i_mode)) {
 2586                                 goto out;
 2587                         }
 2588                         /* will fail later, go on to get the right error */
 2589                 }
 2590         }
 2591 looked_up:
 2592         path->dentry = dentry;
 2593         path->mnt = nd->path.mnt;
 2594         return 1;
 2595 }
 2596 
 2597 /*
 2598  * Look up and maybe create and open the last component.
 2599  *
 2600  * Must be called with i_mutex held on parent.
 2601  *
 2602  * Returns 0 if the file was successfully atomically created (if necessary) and
 2603  * opened.  In this case the file will be returned attached to @file.
 2604  *
 2605  * Returns 1 if the file was not completely opened at this time, though lookups
 2606  * and creations will have been performed and the dentry returned in @path will
 2607  * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 2608  * specified then a negative dentry may be returned.
 2609  *
 2610  * An error code is returned otherwise.
 2611  *
 2612  * FILE_CREATE will be set in @*opened if the dentry was created and will be
 2613  * cleared otherwise prior to returning.
 2614  */
 2615 static int lookup_open(struct nameidata *nd, struct path *path,
 2616                         struct file *file,
 2617                         const struct open_flags *op,
 2618                         bool got_write, int *opened)
 2619 {
 2620         struct dentry *dir = nd->path.dentry;
 2621         struct inode *dir_inode = dir->d_inode;
 2622         struct dentry *dentry;
 2623         int error;
 2624         bool need_lookup;
 2625 
 2626         *opened &= ~FILE_CREATED;
 2627         dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
 2628         if (IS_ERR(dentry))
 2629                 return PTR_ERR(dentry);
 2630 
 2631         /* Cached positive dentry: will open in f_op->open */
 2632         if (!need_lookup && dentry->d_inode)
 2633                 goto out_no_open;
 2634 
 2635         if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
 2636                 return atomic_open(nd, dentry, path, file, op, got_write,
 2637                                    need_lookup, opened);
 2638         }
 2639 
 2640         if (need_lookup) {
 2641                 BUG_ON(dentry->d_inode);
 2642 
 2643                 dentry = lookup_real(dir_inode, dentry, nd->flags);
 2644                 if (IS_ERR(dentry))
 2645                         return PTR_ERR(dentry);
 2646         }
 2647 
 2648         /* Negative dentry, just create the file */
 2649         if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
 2650                 umode_t mode = op->mode;
 2651                 if (!IS_POSIXACL(dir->d_inode))
 2652                         mode &= ~current_umask();
 2653                 /*
 2654                  * This write is needed to ensure that a
 2655                  * rw->ro transition does not occur between
 2656                  * the time when the file is created and when
 2657                  * a permanent write count is taken through
 2658                  * the 'struct file' in finish_open().
 2659                  */
 2660                 if (!got_write) {
 2661                         error = -EROFS;
 2662                         goto out_dput;
 2663                 }
 2664                 *opened |= FILE_CREATED;
 2665                 error = security_path_mknod(&nd->path, dentry, mode, 0);
 2666                 if (error)
 2667                         goto out_dput;
 2668                 error = vfs_create(dir->d_inode, dentry, mode,
 2669                                    nd->flags & LOOKUP_EXCL);
 2670                 if (error)
 2671                         goto out_dput;
 2672         }
 2673 out_no_open:
 2674         path->dentry = dentry;
 2675         path->mnt = nd->path.mnt;
 2676         return 1;
 2677 
 2678 out_dput:
 2679         dput(dentry);
 2680         return error;
 2681 }
 2682 
 2683 /*
 2684  * Handle the last step of open()
 2685  */
 2686 static int do_last(struct nameidata *nd, struct path *path,
 2687                    struct file *file, const struct open_flags *op,
 2688                    int *opened, struct filename *name)
 2689 {
 2690         struct dentry *dir = nd->path.dentry;
 2691         int open_flag = op->open_flag;
 2692         bool will_truncate = (open_flag & O_TRUNC) != 0;
 2693         bool got_write = false;
 2694         int acc_mode = op->acc_mode;
 2695         struct inode *inode;
 2696         bool symlink_ok = false;
 2697         struct path save_parent = { .dentry = NULL, .mnt = NULL };
 2698         bool retried = false;
 2699         int error;
 2700 
 2701         nd->flags &= ~LOOKUP_PARENT;
 2702         nd->flags |= op->intent;
 2703 
 2704         switch (nd->last_type) {
 2705         case LAST_DOTDOT:
 2706         case LAST_DOT:
 2707                 error = handle_dots(nd, nd->last_type);
 2708                 if (error)
 2709                         return error;
 2710                 /* fallthrough */
 2711         case LAST_ROOT:
 2712                 error = complete_walk(nd);
 2713                 if (error)
 2714                         return error;
 2715                 audit_inode(name, nd->path.dentry, 0);
 2716                 if (open_flag & O_CREAT) {
 2717                         error = -EISDIR;
 2718                         goto out;
 2719                 }
 2720                 goto finish_open;
 2721         case LAST_BIND:
 2722                 error = complete_walk(nd);
 2723                 if (error)
 2724                         return error;
 2725                 audit_inode(name, dir, 0);
 2726                 goto finish_open;
 2727         }
 2728 
 2729         if (!(open_flag & O_CREAT)) {
 2730                 if (nd->last.name[nd->last.len])
 2731                         nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 2732                 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
 2733                         symlink_ok = true;
 2734                 /* we _can_ be in RCU mode here */
 2735                 error = lookup_fast(nd, &nd->last, path, &inode);
 2736                 if (likely(!error))
 2737                         goto finish_lookup;
 2738 
 2739                 if (error < 0)
 2740                         goto out;
 2741 
 2742                 BUG_ON(nd->inode != dir->d_inode);
 2743         } else {
 2744                 /* create side of things */
 2745                 /*
 2746                  * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
 2747                  * has been cleared when we got to the last component we are
 2748                  * about to look up
 2749                  */
 2750                 error = complete_walk(nd);
 2751                 if (error)
 2752                         return error;
 2753 
 2754                 audit_inode(name, dir, 0);
 2755                 error = -EISDIR;
 2756                 /* trailing slashes? */
 2757                 if (nd->last.name[nd->last.len])
 2758                         goto out;
 2759         }
 2760 
 2761 retry_lookup:
 2762         if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
 2763                 error = mnt_want_write(nd->path.mnt);
 2764                 if (!error)
 2765                         got_write = true;
 2766                 /*
 2767                  * do _not_ fail yet - we might not need that or fail with
 2768                  * a different error; let lookup_open() decide; we'll be
 2769                  * dropping this one anyway.
 2770                  */
 2771         }
 2772         mutex_lock(&dir->d_inode->i_mutex);
 2773         error = lookup_open(nd, path, file, op, got_write, opened);
 2774         mutex_unlock(&dir->d_inode->i_mutex);
 2775 
 2776         if (error <= 0) {
 2777                 if (error)
 2778                         goto out;
 2779 
 2780                 if ((*opened & FILE_CREATED) ||
 2781                     !S_ISREG(file->f_path.dentry->d_inode->i_mode))
 2782                         will_truncate = false;
 2783 
 2784                 audit_inode(name, file->f_path.dentry, 0);
 2785                 goto opened;
 2786         }
 2787 
 2788         if (*opened & FILE_CREATED) {
 2789                 /* Don't check for write permission, don't truncate */
 2790                 open_flag &= ~O_TRUNC;
 2791                 will_truncate = false;
 2792                 acc_mode = MAY_OPEN;
 2793                 path_to_nameidata(path, nd);
 2794                 goto finish_open_created;
 2795         }
 2796 
 2797         /*
 2798          * create/update audit record if it already exists.
 2799          */
 2800         if (path->dentry->d_inode)
 2801                 audit_inode(name, path->dentry, 0);
 2802 
 2803         /*
 2804          * If atomic_open() acquired write access it is dropped now due to
 2805          * possible mount and symlink following (this might be optimized away if
 2806          * necessary...)
 2807          */
 2808         if (got_write) {
 2809                 mnt_drop_write(nd->path.mnt);
 2810                 got_write = false;
 2811         }
 2812 
 2813         error = -EEXIST;
 2814         if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
 2815                 goto exit_dput;
 2816 
 2817         error = follow_managed(path, nd->flags);
 2818         if (error < 0)
 2819                 goto exit_dput;
 2820 
 2821         if (error)
 2822                 nd->flags |= LOOKUP_JUMPED;
 2823 
 2824         BUG_ON(nd->flags & LOOKUP_RCU);
 2825         inode = path->dentry->d_inode;
 2826 finish_lookup:
 2827         /* we _can_ be in RCU mode here */
 2828         error = -ENOENT;
 2829         if (!inode) {
 2830                 path_to_nameidata(path, nd);
 2831                 goto out;
 2832         }
 2833 
 2834         if (should_follow_link(inode, !symlink_ok)) {
 2835                 if (nd->flags & LOOKUP_RCU) {
 2836                         if (unlikely(unlazy_walk(nd, path->dentry))) {
 2837                                 error = -ECHILD;
 2838                                 goto out;
 2839                         }
 2840                 }
 2841                 BUG_ON(inode != path->dentry->d_inode);
 2842                 return 1;
 2843         }
 2844 
 2845         if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
 2846                 path_to_nameidata(path, nd);
 2847         } else {
 2848                 save_parent.dentry = nd->path.dentry;
 2849                 save_parent.mnt = mntget(path->mnt);
 2850                 nd->path.dentry = path->dentry;
 2851 
 2852         }
 2853         nd->inode = inode;
 2854         /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 2855         error = complete_walk(nd);
 2856         if (error) {
 2857                 path_put(&save_parent);
 2858                 return error;
 2859         }
 2860         error = -EISDIR;
 2861         if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
 2862                 goto out;
 2863         error = -ENOTDIR;
 2864         if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
 2865                 goto out;
 2866         audit_inode(name, nd->path.dentry, 0);
 2867 finish_open:
 2868         if (!S_ISREG(nd->inode->i_mode))
 2869                 will_truncate = false;
 2870 
 2871         if (will_truncate) {
 2872                 error = mnt_want_write(nd->path.mnt);
 2873                 if (error)
 2874                         goto out;
 2875                 got_write = true;
 2876         }
 2877 finish_open_created:
 2878         error = may_open(&nd->path, acc_mode, open_flag);
 2879         if (error)
 2880                 goto out;
 2881         file->f_path.mnt = nd->path.mnt;
 2882         error = finish_open(file, nd->path.dentry, NULL, opened);
 2883         if (error) {
 2884                 if (error == -EOPENSTALE)
 2885                         goto stale_open;
 2886                 goto out;
 2887         }
 2888 opened:
 2889         error = open_check_o_direct(file);
 2890         if (error)
 2891                 goto exit_fput;
 2892         error = ima_file_check(file, op->acc_mode);
 2893         if (error)
 2894                 goto exit_fput;
 2895 
 2896         if (will_truncate) {
 2897                 error = handle_truncate(file);
 2898                 if (error)
 2899                         goto exit_fput;
 2900         }
 2901 out:
 2902         if (got_write)
 2903                 mnt_drop_write(nd->path.mnt);
 2904         path_put(&save_parent);
 2905         terminate_walk(nd);
 2906         return error;
 2907 
 2908 exit_dput:
 2909         path_put_conditional(path, nd);
 2910         goto out;
 2911 exit_fput:
 2912         fput(file);
 2913         goto out;
 2914 
 2915 stale_open:
 2916         /* If no saved parent or already retried then can't retry */
 2917         if (!save_parent.dentry || retried)
 2918                 goto out;
 2919 
 2920         BUG_ON(save_parent.dentry != dir);
 2921         path_put(&nd->path);
 2922         nd->path = save_parent;
 2923         nd->inode = dir->d_inode;
 2924         save_parent.mnt = NULL;
 2925         save_parent.dentry = NULL;
 2926         if (got_write) {
 2927                 mnt_drop_write(nd->path.mnt);
 2928                 got_write = false;
 2929         }
 2930         retried = true;
 2931         goto retry_lookup;
 2932 }
 2933 
 2934 static struct file *path_openat(int dfd, struct filename *pathname,
 2935                 struct nameidata *nd, const struct open_flags *op, int flags)
 2936 {
 2937         struct file *base = NULL;
 2938         struct file *file;
 2939         struct path path;
 2940         int opened = 0;
 2941         int error;
 2942 
 2943         file = get_empty_filp();
 2944         if (!file)
 2945                 return ERR_PTR(-ENFILE);
 2946 
 2947         file->f_flags = op->open_flag;
 2948 
 2949         error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
 2950         if (unlikely(error))
 2951                 goto out;
 2952 
 2953         current->total_link_count = 0;
 2954         error = link_path_walk(pathname->name, nd);
 2955         if (unlikely(error))
 2956                 goto out;
 2957 
 2958         error = do_last(nd, &path, file, op, &opened, pathname);
 2959         while (unlikely(error > 0)) { /* trailing symlink */
 2960                 struct path link = path;
 2961                 void *cookie;
 2962                 if (!(nd->flags & LOOKUP_FOLLOW)) {
 2963                         path_put_conditional(&path, nd);
 2964                         path_put(&nd->path);
 2965                         error = -ELOOP;
 2966                         break;
 2967                 }
 2968                 error = may_follow_link(&link, nd);
 2969                 if (unlikely(error))
 2970                         break;
 2971                 nd->flags |= LOOKUP_PARENT;
 2972                 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
 2973                 error = follow_link(&link, nd, &cookie);
 2974                 if (unlikely(error))
 2975                         break;
 2976                 error = do_last(nd, &path, file, op, &opened, pathname);
 2977                 put_link(nd, &link, cookie);
 2978         }
 2979 out:
 2980         if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
 2981                 path_put(&nd->root);
 2982         if (base)
 2983                 fput(base);
 2984         if (!(opened & FILE_OPENED)) {
 2985                 BUG_ON(!error);
 2986                 put_filp(file);
 2987         }
 2988         if (unlikely(error)) {
 2989                 if (error == -EOPENSTALE) {
 2990                         if (flags & LOOKUP_RCU)
 2991                                 error = -ECHILD;
 2992                         else
 2993                                 error = -ESTALE;
 2994                 }
 2995                 file = ERR_PTR(error);
 2996         }
 2997         return file;
 2998 }
 2999 
 3000 struct file *do_filp_open(int dfd, struct filename *pathname,
 3001                 const struct open_flags *op, int flags)
 3002 {
 3003         struct nameidata nd;
 3004         struct file *filp;
 3005 
 3006         filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 3007         if (unlikely(filp == ERR_PTR(-ECHILD)))
 3008                 filp = path_openat(dfd, pathname, &nd, op, flags);
 3009         if (unlikely(filp == ERR_PTR(-ESTALE)))
 3010                 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
 3011         return filp;
 3012 }
 3013 
 3014 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 3015                 const char *name, const struct open_flags *op, int flags)
 3016 {
 3017         struct nameidata nd;
 3018         struct file *file;
 3019         struct filename filename = { .name = name };
 3020 
 3021         nd.root.mnt = mnt;
 3022         nd.root.dentry = dentry;
 3023 
 3024         flags |= LOOKUP_ROOT;
 3025 
 3026         if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 3027                 return ERR_PTR(-ELOOP);
 3028 
 3029         file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
 3030         if (unlikely(file == ERR_PTR(-ECHILD)))
 3031                 file = path_openat(-1, &filename, &nd, op, flags);
 3032         if (unlikely(file == ERR_PTR(-ESTALE)))
 3033                 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
 3034         return file;
 3035 }
 3036 
 3037 struct dentry *kern_path_create(int dfd, const char *pathname,
 3038                                 struct path *path, unsigned int lookup_flags)
 3039 {
 3040         struct dentry *dentry = ERR_PTR(-EEXIST);
 3041         struct nameidata nd;
 3042         int err2;
 3043         int error;
 3044         bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
 3045 
 3046         /*
 3047          * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
 3048          * other flags passed in are ignored!
 3049          */
 3050         lookup_flags &= LOOKUP_REVAL;
 3051 
 3052         error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
 3053         if (error)
 3054                 return ERR_PTR(error);
 3055 
 3056         /*
 3057          * Yucky last component or no last component at all?
 3058          * (foo/., foo/.., /////)
 3059          */
 3060         if (nd.last_type != LAST_NORM)
 3061                 goto out;
 3062         nd.flags &= ~LOOKUP_PARENT;
 3063         nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
 3064 
 3065         /* don't fail immediately if it's r/o, at least try to report other errors */
 3066         err2 = mnt_want_write(nd.path.mnt);
 3067         /*
 3068          * Do the final lookup.
 3069          */
 3070         mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 3071         dentry = lookup_hash(&nd);
 3072         if (IS_ERR(dentry))
 3073                 goto unlock;
 3074 
 3075         error = -EEXIST;
 3076         if (dentry->d_inode)
 3077                 goto fail;
 3078         /*
 3079          * Special case - lookup gave negative, but... we had foo/bar/
 3080          * From the vfs_mknod() POV we just have a negative dentry -
 3081          * all is fine. Let's be bastards - you had / on the end, you've
 3082          * been asking for (non-existent) directory. -ENOENT for you.
 3083          */
 3084         if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
 3085                 error = -ENOENT;
 3086                 goto fail;
 3087         }
 3088         if (unlikely(err2)) {
 3089                 error = err2;
 3090                 goto fail;
 3091         }
 3092         *path = nd.path;
 3093         return dentry;
 3094 fail:
 3095         dput(dentry);
 3096         dentry = ERR_PTR(error);
 3097 unlock:
 3098         mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 3099         if (!err2)
 3100                 mnt_drop_write(nd.path.mnt);
 3101 out:
 3102         path_put(&nd.path);
 3103         return dentry;
 3104 }
 3105 EXPORT_SYMBOL(kern_path_create);
 3106 
 3107 void done_path_create(struct path *path, struct dentry *dentry)
 3108 {
 3109         dput(dentry);
 3110         mutex_unlock(&path->dentry->d_inode->i_mutex);
 3111         mnt_drop_write(path->mnt);
 3112         path_put(path);
 3113 }
 3114 EXPORT_SYMBOL(done_path_create);
 3115 
 3116 struct dentry *user_path_create(int dfd, const char __user *pathname,
 3117                                 struct path *path, unsigned int lookup_flags)
 3118 {
 3119         struct filename *tmp = getname(pathname);
 3120         struct dentry *res;
 3121         if (IS_ERR(tmp))
 3122                 return ERR_CAST(tmp);
 3123         res = kern_path_create(dfd, tmp->name, path, lookup_flags);
 3124         putname(tmp);
 3125         return res;
 3126 }
 3127 EXPORT_SYMBOL(user_path_create);
 3128 
 3129 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 3130 {
 3131         int error = may_create(dir, dentry);
 3132 
 3133         if (error)
 3134                 return error;
 3135 
 3136         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
 3137                 return -EPERM;
 3138 
 3139         if (!dir->i_op->mknod)
 3140                 return -EPERM;
 3141 
 3142         error = devcgroup_inode_mknod(mode, dev);
 3143         if (error)
 3144                 return error;
 3145 
 3146         error = security_inode_mknod(dir, dentry, mode, dev);
 3147         if (error)
 3148                 return error;
 3149 
 3150         error = dir->i_op->mknod(dir, dentry, mode, dev);
 3151         if (!error)
 3152                 fsnotify_create(dir, dentry);
 3153         return error;
 3154 }
 3155 
 3156 static int may_mknod(umode_t mode)
 3157 {
 3158         switch (mode & S_IFMT) {
 3159         case S_IFREG:
 3160         case S_IFCHR:
 3161         case S_IFBLK:
 3162         case S_IFIFO:
 3163         case S_IFSOCK:
 3164         case 0: /* zero mode translates to S_IFREG */
 3165                 return 0;
 3166         case S_IFDIR:
 3167                 return -EPERM;
 3168         default:
 3169                 return -EINVAL;
 3170         }
 3171 }
 3172 
 3173 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
 3174                 unsigned, dev)
 3175 {
 3176         struct dentry *dentry;
 3177         struct path path;
 3178         int error;
 3179         unsigned int lookup_flags = 0;
 3180 
 3181         error = may_mknod(mode);
 3182         if (error)
 3183                 return error;
 3184 retry:
 3185         dentry = user_path_create(dfd, filename, &path, lookup_flags);
 3186         if (IS_ERR(dentry))
 3187                 return PTR_ERR(dentry);
 3188 
 3189         if (!IS_POSIXACL(path.dentry->d_inode))
 3190                 mode &= ~current_umask();
 3191         error = security_path_mknod(&path, dentry, mode, dev);
 3192         if (error)
 3193                 goto out;
 3194         switch (mode & S_IFMT) {
 3195                 case 0: case S_IFREG:
 3196                         error = vfs_create(path.dentry->d_inode,dentry,mode,true);
 3197                         break;
 3198                 case S_IFCHR: case S_IFBLK:
 3199                         error = vfs_mknod(path.dentry->d_inode,dentry,mode,
 3200                                         new_decode_dev(dev));
 3201                         break;
 3202                 case S_IFIFO: case S_IFSOCK:
 3203                         error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
 3204                         break;
 3205         }
 3206 out:
 3207         done_path_create(&path, dentry);
 3208         if (retry_estale(error, lookup_flags)) {
 3209                 lookup_flags |= LOOKUP_REVAL;
 3210                 goto retry;
 3211         }
 3212         return error;
 3213 }
 3214 
 3215 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
 3216 {
 3217         return sys_mknodat(AT_FDCWD, filename, mode, dev);
 3218 }
 3219 
 3220 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 3221 {
 3222         int error = may_create(dir, dentry);
 3223         unsigned max_links = dir->i_sb->s_max_links;
 3224 
 3225         if (error)
 3226                 return error;
 3227 
 3228         if (!dir->i_op->mkdir)
 3229                 return -EPERM;
 3230 
 3231         mode &= (S_IRWXUGO|S_ISVTX);
 3232         error = security_inode_mkdir(dir, dentry, mode);
 3233         if (error)
 3234                 return error;
 3235 
 3236         if (max_links && dir->i_nlink >= max_links)
 3237                 return -EMLINK;
 3238 
 3239         error = dir->i_op->mkdir(dir, dentry, mode);
 3240         if (!error)
 3241                 fsnotify_mkdir(dir, dentry);
 3242         return error;
 3243 }
 3244 
 3245 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
 3246 {
 3247         struct dentry *dentry;
 3248         struct path path;
 3249         int error;
 3250         unsigned int lookup_flags = LOOKUP_DIRECTORY;
 3251 
 3252 retry:
 3253         dentry = user_path_create(dfd, pathname, &path, lookup_flags);
 3254         if (IS_ERR(dentry))
 3255                 return PTR_ERR(dentry);
 3256 
 3257         if (!IS_POSIXACL(path.dentry->d_inode))
 3258                 mode &= ~current_umask();
 3259         error = security_path_mkdir(&path, dentry, mode);
 3260         if (!error)
 3261                 error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
 3262         done_path_create(&path, dentry);
 3263         if (retry_estale(error, lookup_flags)) {
 3264                 lookup_flags |= LOOKUP_REVAL;
 3265                 goto retry;
 3266         }
 3267         return error;
 3268 }
 3269 
 3270 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 3271 {
 3272         return sys_mkdirat(AT_FDCWD, pathname, mode);
 3273 }
 3274 
 3275 /*
 3276  * The dentry_unhash() helper will try to drop the dentry early: we
 3277  * should have a usage count of 1 if we're the only user of this
 3278  * dentry, and if that is true (possibly after pruning the dcache),
 3279  * then we drop the dentry now.
 3280  *
 3281  * A low-level filesystem can, if it choses, legally
 3282  * do a
 3283  *
 3284  *      if (!d_unhashed(dentry))
 3285  *              return -EBUSY;
 3286  *
 3287  * if it cannot handle the case of removing a directory
 3288  * that is still in use by something else..
 3289  */
 3290 void dentry_unhash(struct dentry *dentry)
 3291 {
 3292         shrink_dcache_parent(dentry);
 3293         spin_lock(&dentry->d_lock);
 3294         if (dentry->d_count == 1)
 3295                 __d_drop(dentry);
 3296         spin_unlock(&dentry->d_lock);
 3297 }
 3298 
 3299 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 3300 {
 3301         int error = may_delete(dir, dentry, 1);
 3302 
 3303         if (error)
 3304                 return error;
 3305 
 3306         if (!dir->i_op->rmdir)
 3307                 return -EPERM;
 3308 
 3309         dget(dentry);
 3310         mutex_lock(&dentry->d_inode->i_mutex);
 3311 
 3312         error = -EBUSY;
 3313         if (d_mountpoint(dentry))
 3314                 goto out;
 3315 
 3316         error = security_inode_rmdir(dir, dentry);
 3317         if (error)
 3318                 goto out;
 3319 
 3320         shrink_dcache_parent(dentry);
 3321         error = dir->i_op->rmdir(dir, dentry);
 3322         if (error)
 3323                 goto out;
 3324 
 3325         dentry->d_inode->i_flags |= S_DEAD;
 3326         dont_mount(dentry);
 3327 
 3328 out:
 3329         mutex_unlock(&dentry->d_inode->i_mutex);
 3330         dput(dentry);
 3331         if (!error)
 3332                 d_delete(dentry);
 3333         return error;
 3334 }
 3335 
 3336 static long do_rmdir(int dfd, const char __user *pathname)
 3337 {
 3338         int error = 0;
 3339         struct filename *name;
 3340         struct dentry *dentry;
 3341         struct nameidata nd;
 3342         unsigned int lookup_flags = 0;
 3343 retry:
 3344         name = user_path_parent(dfd, pathname, &nd, lookup_flags);
 3345         if (IS_ERR(name))
 3346                 return PTR_ERR(name);
 3347 
 3348         switch(nd.last_type) {
 3349         case LAST_DOTDOT:
 3350                 error = -ENOTEMPTY;
 3351                 goto exit1;
 3352         case LAST_DOT:
 3353                 error = -EINVAL;
 3354                 goto exit1;
 3355         case LAST_ROOT:
 3356                 error = -EBUSY;
 3357                 goto exit1;
 3358         }
 3359 
 3360         nd.flags &= ~LOOKUP_PARENT;
 3361         error = mnt_want_write(nd.path.mnt);
 3362         if (error)
 3363                 goto exit1;
 3364 
 3365         mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 3366         dentry = lookup_hash(&nd);
 3367         error = PTR_ERR(dentry);
 3368         if (IS_ERR(dentry))
 3369                 goto exit2;
 3370         if (!dentry->d_inode) {
 3371                 error = -ENOENT;
 3372                 goto exit3;
 3373         }
 3374         error = security_path_rmdir(&nd.path, dentry);
 3375         if (error)
 3376                 goto exit3;
 3377         error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
 3378 exit3:
 3379         dput(dentry);
 3380 exit2:
 3381         mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 3382         mnt_drop_write(nd.path.mnt);
 3383 exit1:
 3384         path_put(&nd.path);
 3385         putname(name);
 3386         if (retry_estale(error, lookup_flags)) {
 3387                 lookup_flags |= LOOKUP_REVAL;
 3388                 goto retry;
 3389         }
 3390         return error;
 3391 }
 3392 
 3393 SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 3394 {
 3395         return do_rmdir(AT_FDCWD, pathname);
 3396 }
 3397 
 3398 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 3399 {
 3400         int error = may_delete(dir, dentry, 0);
 3401 
 3402         if (error)
 3403                 return error;
 3404 
 3405         if (!dir->i_op->unlink)
 3406                 return -EPERM;
 3407 
 3408         mutex_lock(&dentry->d_inode->i_mutex);
 3409         if (d_mountpoint(dentry))
 3410                 error = -EBUSY;
 3411         else {
 3412                 error = security_inode_unlink(dir, dentry);
 3413                 if (!error) {
 3414                         error = dir->i_op->unlink(dir, dentry);
 3415                         if (!error)
 3416                                 dont_mount(dentry);
 3417                 }
 3418         }
 3419         mutex_unlock(&dentry->d_inode->i_mutex);
 3420 
 3421         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
 3422         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
 3423                 fsnotify_link_count(dentry->d_inode);
 3424                 d_delete(dentry);
 3425         }
 3426 
 3427         return error;
 3428 }
 3429 
 3430 /*
 3431  * Make sure that the actual truncation of the file will occur outside its
 3432  * directory's i_mutex.  Truncate can take a long time if there is a lot of
 3433  * writeout happening, and we don't want to prevent access to the directory
 3434  * while waiting on the I/O.
 3435  */
 3436 static long do_unlinkat(int dfd, const char __user *pathname)
 3437 {
 3438         int error;
 3439         struct filename *name;
 3440         struct dentry *dentry;
 3441         struct nameidata nd;
 3442         struct inode *inode = NULL;
 3443         unsigned int lookup_flags = 0;
 3444 retry:
 3445         name = user_path_parent(dfd, pathname, &nd, lookup_flags);
 3446         if (IS_ERR(name))
 3447                 return PTR_ERR(name);
 3448 
 3449         error = -EISDIR;
 3450         if (nd.last_type != LAST_NORM)
 3451                 goto exit1;
 3452 
 3453         nd.flags &= ~LOOKUP_PARENT;
 3454         error = mnt_want_write(nd.path.mnt);
 3455         if (error)
 3456                 goto exit1;
 3457 
 3458         mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 3459         dentry = lookup_hash(&nd);
 3460         error = PTR_ERR(dentry);
 3461         if (!IS_ERR(dentry)) {
 3462                 /* Why not before? Because we want correct error value */
 3463                 if (nd.last.name[nd.last.len])
 3464                         goto slashes;
 3465                 inode = dentry->d_inode;
 3466                 if (!inode)
 3467                         goto slashes;
 3468                 ihold(inode);
 3469                 error = security_path_unlink(&nd.path, dentry);
 3470                 if (error)
 3471                         goto exit2;
 3472                 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
 3473 exit2:
 3474                 dput(dentry);
 3475         }
 3476         mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 3477         if (inode)
 3478                 iput(inode);    /* truncate the inode here */
 3479         mnt_drop_write(nd.path.mnt);
 3480 exit1:
 3481         path_put(&nd.path);
 3482         putname(name);
 3483         if (retry_estale(error, lookup_flags)) {
 3484                 lookup_flags |= LOOKUP_REVAL;
 3485                 inode = NULL;
 3486                 goto retry;
 3487         }
 3488         return error;
 3489 
 3490 slashes:
 3491         error = !dentry->d_inode ? -ENOENT :
 3492                 S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
 3493         goto exit2;
 3494 }
 3495 
 3496 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
 3497 {
 3498         if ((flag & ~AT_REMOVEDIR) != 0)
 3499                 return -EINVAL;
 3500 
 3501         if (flag & AT_REMOVEDIR)
 3502                 return do_rmdir(dfd, pathname);
 3503 
 3504         return do_unlinkat(dfd, pathname);
 3505 }
 3506 
 3507 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 3508 {
 3509         return do_unlinkat(AT_FDCWD, pathname);
 3510 }
 3511 
 3512 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 3513 {
 3514         int error = may_create(dir, dentry);
 3515 
 3516         if (error)
 3517                 return error;
 3518 
 3519         if (!dir->i_op->symlink)
 3520                 return -EPERM;
 3521 
 3522         error = security_inode_symlink(dir, dentry, oldname);
 3523         if (error)
 3524                 return error;
 3525 
 3526         error = dir->i_op->symlink(dir, dentry, oldname);
 3527         if (!error)
 3528                 fsnotify_create(dir, dentry);
 3529         return error;
 3530 }
 3531 
 3532 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 3533                 int, newdfd, const char __user *, newname)
 3534 {
 3535         int error;
 3536         struct filename *from;
 3537         struct dentry *dentry;
 3538         struct path path;
 3539         unsigned int lookup_flags = 0;
 3540 
 3541         from = getname(oldname);
 3542         if (IS_ERR(from))
 3543                 return PTR_ERR(from);
 3544 retry:
 3545         dentry = user_path_create(newdfd, newname, &path, lookup_flags);
 3546         error = PTR_ERR(dentry);
 3547         if (IS_ERR(dentry))
 3548                 goto out_putname;
 3549 
 3550         error = security_path_symlink(&path, dentry, from->name);
 3551         if (!error)
 3552                 error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
 3553         done_path_create(&path, dentry);
 3554         if (retry_estale(error, lookup_flags)) {
 3555                 lookup_flags |= LOOKUP_REVAL;
 3556                 goto retry;
 3557         }
 3558 out_putname:
 3559         putname(from);
 3560         return error;
 3561 }
 3562 
 3563 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
 3564 {
 3565         return sys_symlinkat(oldname, AT_FDCWD, newname);
 3566 }
 3567 
 3568 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 3569 {
 3570         struct inode *inode = old_dentry->d_inode;
 3571         unsigned max_links = dir->i_sb->s_max_links;
 3572         int error;
 3573 
 3574         if (!inode)
 3575                 return -ENOENT;
 3576 
 3577         error = may_create(dir, new_dentry);
 3578         if (error)
 3579                 return error;
 3580 
 3581         if (dir->i_sb != inode->i_sb)
 3582                 return -EXDEV;
 3583 
 3584         /*
 3585          * A link to an append-only or immutable file cannot be created.
 3586          */
 3587         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 3588                 return -EPERM;
 3589         if (!dir->i_op->link)
 3590                 return -EPERM;
 3591         if (S_ISDIR(inode->i_mode))
 3592                 return -EPERM;
 3593 
 3594         error = security_inode_link(old_dentry, dir, new_dentry);
 3595         if (error)
 3596                 return error;
 3597 
 3598         mutex_lock(&inode->i_mutex);
 3599         /* Make sure we don't allow creating hardlink to an unlinked file */
 3600         if (inode->i_nlink == 0)
 3601                 error =  -ENOENT;
 3602         else if (max_links && inode->i_nlink >= max_links)
 3603                 error = -EMLINK;
 3604         else
 3605                 error = dir->i_op->link(old_dentry, dir, new_dentry);
 3606         mutex_unlock(&inode->i_mutex);
 3607         if (!error)
 3608                 fsnotify_link(dir, inode, new_dentry);
 3609         return error;
 3610 }
 3611 
 3612 /*
 3613  * Hardlinks are often used in delicate situations.  We avoid
 3614  * security-related surprises by not following symlinks on the
 3615  * newname.  --KAB
 3616  *
 3617  * We don't follow them on the oldname either to be compatible
 3618  * with linux 2.0, and to avoid hard-linking to directories
 3619  * and other special files.  --ADM
 3620  */
 3621 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 3622                 int, newdfd, const char __user *, newname, int, flags)
 3623 {
 3624         struct dentry *new_dentry;
 3625         struct path old_path, new_path;
 3626         int how = 0;
 3627         int error;
 3628 
 3629         if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
 3630                 return -EINVAL;
 3631         /*
 3632          * To use null names we require CAP_DAC_READ_SEARCH
 3633          * This ensures that not everyone will be able to create
 3634          * handlink using the passed filedescriptor.
 3635          */
 3636         if (flags & AT_EMPTY_PATH) {
 3637                 if (!capable(CAP_DAC_READ_SEARCH))
 3638                         return -ENOENT;
 3639                 how = LOOKUP_EMPTY;
 3640         }
 3641 
 3642         if (flags & AT_SYMLINK_FOLLOW)
 3643                 how |= LOOKUP_FOLLOW;
 3644 retry:
 3645         error = user_path_at(olddfd, oldname, how, &old_path);
 3646         if (error)
 3647                 return error;
 3648 
 3649         new_dentry = user_path_create(newdfd, newname, &new_path,
 3650                                         (how & LOOKUP_REVAL));
 3651         error = PTR_ERR(new_dentry);
 3652         if (IS_ERR(new_dentry))
 3653                 goto out;
 3654 
 3655         error = -EXDEV;
 3656         if (old_path.mnt != new_path.mnt)
 3657                 goto out_dput;
 3658         error = may_linkat(&old_path);
 3659         if (unlikely(error))
 3660                 goto out_dput;
 3661         error = security_path_link(old_path.dentry, &new_path, new_dentry);
 3662         if (error)
 3663                 goto out_dput;
 3664         error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
 3665 out_dput:
 3666         done_path_create(&new_path, new_dentry);
 3667         if (retry_estale(error, how)) {
 3668                 how |= LOOKUP_REVAL;
 3669                 goto retry;
 3670         }
 3671 out:
 3672         path_put(&old_path);
 3673 
 3674         return error;
 3675 }
 3676 
 3677 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
 3678 {
 3679         return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 3680 }
 3681 
 3682 /*
 3683  * The worst of all namespace operations - renaming directory. "Perverted"
 3684  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 3685  * Problems:
 3686  *      a) we can get into loop creation. Check is done in is_subdir().
 3687  *      b) race potential - two innocent renames can create a loop together.
 3688  *         That's where 4.4 screws up. Current fix: serialization on
 3689  *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 3690  *         story.
 3691  *      c) we have to lock _three_ objects - parents and victim (if it exists).
 3692  *         And that - after we got ->i_mutex on parents (until then we don't know
 3693  *         whether the target exists).  Solution: try to be smart with locking
 3694  *         order for inodes.  We rely on the fact that tree topology may change
 3695  *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
 3696  *         move will be locked.  Thus we can rank directories by the tree
 3697  *         (ancestors first) and rank all non-directories after them.
 3698  *         That works since everybody except rename does "lock parent, lookup,
 3699  *         lock child" and rename is under ->s_vfs_rename_mutex.
 3700  *         HOWEVER, it relies on the assumption that any object with ->lookup()
 3701  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
 3702  *         we'd better make sure that there's no link(2) for them.
 3703  *      d) conversion from fhandle to dentry may come in the wrong moment - when
 3704  *         we are removing the target. Solution: we will have to grab ->i_mutex
 3705  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 3706  *         ->i_mutex on parents, which works but leads to some truly excessive
 3707  *         locking].
 3708  */
 3709 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 3710                           struct inode *new_dir, struct dentry *new_dentry)
 3711 {
 3712         int error = 0;
 3713         struct inode *target = new_dentry->d_inode;
 3714         unsigned max_links = new_dir->i_sb->s_max_links;
 3715 
 3716         /*
 3717          * If we are going to change the parent - check write permissions,
 3718          * we'll need to flip '..'.
 3719          */
 3720         if (new_dir != old_dir) {
 3721                 error = inode_permission(old_dentry->d_inode, MAY_WRITE);
 3722                 if (error)
 3723                         return error;
 3724         }
 3725 
 3726         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
 3727         if (error)
 3728                 return error;
 3729 
 3730         dget(new_dentry);
 3731         if (target)
 3732                 mutex_lock(&target->i_mutex);
 3733 
 3734         error = -EBUSY;
 3735         if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
 3736                 goto out;
 3737 
 3738         error = -EMLINK;
 3739         if (max_links && !target && new_dir != old_dir &&
 3740             new_dir->i_nlink >= max_links)
 3741                 goto out;
 3742 
 3743         if (target)
 3744                 shrink_dcache_parent(new_dentry);
 3745         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 3746         if (error)
 3747                 goto out;
 3748 
 3749         if (target) {
 3750                 target->i_flags |= S_DEAD;
 3751                 dont_mount(new_dentry);
 3752         }
 3753 out:
 3754         if (target)
 3755                 mutex_unlock(&target->i_mutex);
 3756         dput(new_dentry);
 3757         if (!error)
 3758                 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 3759                         d_move(old_dentry,new_dentry);
 3760         return error;
 3761 }
 3762 
 3763 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 3764                             struct inode *new_dir, struct dentry *new_dentry)
 3765 {
 3766         struct inode *target = new_dentry->d_inode;
 3767         int error;
 3768 
 3769         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
 3770         if (error)
 3771                 return error;
 3772 
 3773         dget(new_dentry);
 3774         if (target)
 3775                 mutex_lock(&target->i_mutex);
 3776 
 3777         error = -EBUSY;
 3778         if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 3779                 goto out;
 3780 
 3781         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 3782         if (error)
 3783                 goto out;
 3784 
 3785         if (target)
 3786                 dont_mount(new_dentry);
 3787         if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 3788                 d_move(old_dentry, new_dentry);
 3789 out:
 3790         if (target)
 3791                 mutex_unlock(&target->i_mutex);
 3792         dput(new_dentry);
 3793         return error;
 3794 }
 3795 
 3796 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 3797                struct inode *new_dir, struct dentry *new_dentry)
 3798 {
 3799         int error;
 3800         int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
 3801         const unsigned char *old_name;
 3802 
 3803         if (old_dentry->d_inode == new_dentry->d_inode)
 3804                 return 0;
 3805  
 3806         error = may_delete(old_dir, old_dentry, is_dir);
 3807         if (error)
 3808                 return error;
 3809 
 3810         if (!new_dentry->d_inode)
 3811                 error = may_create(new_dir, new_dentry);
 3812         else
 3813                 error = may_delete(new_dir, new_dentry, is_dir);
 3814         if (error)
 3815                 return error;
 3816 
 3817         if (!old_dir->i_op->rename)
 3818                 return -EPERM;
 3819 
 3820         old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 3821 
 3822         if (is_dir)
 3823                 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 3824         else
 3825                 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
 3826         if (!error)
 3827                 fsnotify_move(old_dir, new_dir, old_name, is_dir,
 3828                               new_dentry->d_inode, old_dentry);
 3829         fsnotify_oldname_free(old_name);
 3830 
 3831         return error;
 3832 }
 3833 
 3834 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 3835                 int, newdfd, const char __user *, newname)
 3836 {
 3837         struct dentry *old_dir, *new_dir;
 3838         struct dentry *old_dentry, *new_dentry;
 3839         struct dentry *trap;
 3840         struct nameidata oldnd, newnd;
 3841         struct filename *from;
 3842         struct filename *to;
 3843         unsigned int lookup_flags = 0;
 3844         bool should_retry = false;
 3845         int error;
 3846 retry:
 3847         from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
 3848         if (IS_ERR(from)) {
 3849                 error = PTR_ERR(from);
 3850                 goto exit;
 3851         }
 3852 
 3853         to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
 3854         if (IS_ERR(to)) {
 3855                 error = PTR_ERR(to);
 3856                 goto exit1;
 3857         }
 3858 
 3859         error = -EXDEV;
 3860         if (oldnd.path.mnt != newnd.path.mnt)
 3861                 goto exit2;
 3862 
 3863         old_dir = oldnd.path.dentry;
 3864         error = -EBUSY;
 3865         if (oldnd.last_type != LAST_NORM)
 3866                 goto exit2;
 3867 
 3868         new_dir = newnd.path.dentry;
 3869         if (newnd.last_type != LAST_NORM)
 3870                 goto exit2;
 3871 
 3872         error = mnt_want_write(oldnd.path.mnt);
 3873         if (error)
 3874                 goto exit2;
 3875 
 3876         oldnd.flags &= ~LOOKUP_PARENT;
 3877         newnd.flags &= ~LOOKUP_PARENT;
 3878         newnd.flags |= LOOKUP_RENAME_TARGET;
 3879 
 3880         trap = lock_rename(new_dir, old_dir);
 3881 
 3882         old_dentry = lookup_hash(&oldnd);
 3883         error = PTR_ERR(old_dentry);
 3884         if (IS_ERR(old_dentry))
 3885                 goto exit3;
 3886         /* source must exist */
 3887         error = -ENOENT;
 3888         if (!old_dentry->d_inode)
 3889                 goto exit4;
 3890         /* unless the source is a directory trailing slashes give -ENOTDIR */
 3891         if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
 3892                 error = -ENOTDIR;
 3893                 if (oldnd.last.name[oldnd.last.len])
 3894                         goto exit4;
 3895                 if (newnd.last.name[newnd.last.len])
 3896                         goto exit4;
 3897         }
 3898         /* source should not be ancestor of target */
 3899         error = -EINVAL;
 3900         if (old_dentry == trap)
 3901                 goto exit4;
 3902         new_dentry = lookup_hash(&newnd);
 3903         error = PTR_ERR(new_dentry);
 3904         if (IS_ERR(new_dentry))
 3905                 goto exit4;
 3906         /* target should not be an ancestor of source */
 3907         error = -ENOTEMPTY;
 3908         if (new_dentry == trap)
 3909                 goto exit5;
 3910 
 3911         error = security_path_rename(&oldnd.path, old_dentry,
 3912                                      &newnd.path, new_dentry);
 3913         if (error)
 3914                 goto exit5;
 3915         error = vfs_rename(old_dir->d_inode, old_dentry,
 3916                                    new_dir->d_inode, new_dentry);
 3917 exit5:
 3918         dput(new_dentry);
 3919 exit4:
 3920         dput(old_dentry);
 3921 exit3:
 3922         unlock_rename(new_dir, old_dir);
 3923         mnt_drop_write(oldnd.path.mnt);
 3924 exit2:
 3925         if (retry_estale(error, lookup_flags))
 3926                 should_retry = true;
 3927         path_put(&newnd.path);
 3928         putname(to);
 3929 exit1:
 3930         path_put(&oldnd.path);
 3931         putname(from);
 3932         if (should_retry) {
 3933                 should_retry = false;
 3934                 lookup_flags |= LOOKUP_REVAL;
 3935                 goto retry;
 3936         }
 3937 exit:
 3938         return error;
 3939 }
 3940 
 3941 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 3942 {
 3943         return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 3944 }
 3945 
 3946 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
 3947 {
 3948         int len;
 3949 
 3950         len = PTR_ERR(link);
 3951         if (IS_ERR(link))
 3952                 goto out;
 3953 
 3954         len = strlen(link);
 3955         if (len > (unsigned) buflen)
 3956                 len = buflen;
 3957         if (copy_to_user(buffer, link, len))
 3958                 len = -EFAULT;
 3959 out:
 3960         return len;
 3961 }
 3962 
 3963 /*
 3964  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
 3965  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
 3966  * using) it for any given inode is up to filesystem.
 3967  */
 3968 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 3969 {
 3970         struct nameidata nd;
 3971         void *cookie;
 3972         int res;
 3973 
 3974         nd.depth = 0;
 3975         cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
 3976         if (IS_ERR(cookie))
 3977                 return PTR_ERR(cookie);
 3978 
 3979         res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
 3980         if (dentry->d_inode->i_op->put_link)
 3981                 dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
 3982         return res;
 3983 }
 3984 
 3985 int vfs_follow_link(struct nameidata *nd, const char *link)
 3986 {
 3987         return __vfs_follow_link(nd, link);
 3988 }
 3989 
 3990 /* get the link contents into pagecache */
 3991 static char *page_getlink(struct dentry * dentry, struct page **ppage)
 3992 {
 3993         char *kaddr;
 3994         struct page *page;
 3995         struct address_space *mapping = dentry->d_inode->i_mapping;
 3996         page = read_mapping_page(mapping, 0, NULL);
 3997         if (IS_ERR(page))
 3998                 return (char*)page;
 3999         *ppage = page;
 4000         kaddr = kmap(page);
 4001         nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
 4002         return kaddr;
 4003 }
 4004 
 4005 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 4006 {
 4007         struct page *page = NULL;
 4008         char *s = page_getlink(dentry, &page);
 4009         int res = vfs_readlink(dentry,buffer,buflen,s);
 4010         if (page) {
 4011                 kunmap(page);
 4012                 page_cache_release(page);
 4013         }
 4014         return res;
 4015 }
 4016 
 4017 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 4018 {
 4019         struct page *page = NULL;
 4020         nd_set_link(nd, page_getlink(dentry, &page));
 4021         return page;
 4022 }
 4023 
 4024 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 4025 {
 4026         struct page *page = cookie;
 4027 
 4028         if (page) {
 4029                 kunmap(page);
 4030                 page_cache_release(page);
 4031         }
 4032 }
 4033 
 4034 /*
 4035  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 4036  */
 4037 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
 4038 {
 4039         struct address_space *mapping = inode->i_mapping;
 4040         struct page *page;
 4041         void *fsdata;
 4042         int err;
 4043         char *kaddr;
 4044         unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
 4045         if (nofs)
 4046                 flags |= AOP_FLAG_NOFS;
 4047 
 4048 retry:
 4049         err = pagecache_write_begin(NULL, mapping, 0, len-1,
 4050                                 flags, &page, &fsdata);
 4051         if (err)
 4052                 goto fail;
 4053 
 4054         kaddr = kmap_atomic(page);
 4055         memcpy(kaddr, symname, len-1);
 4056         kunmap_atomic(kaddr);
 4057 
 4058         err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
 4059                                                         page, fsdata);
 4060         if (err < 0)
 4061                 goto fail;
 4062         if (err < len-1)
 4063                 goto retry;
 4064 
 4065         mark_inode_dirty(inode);
 4066         return 0;
 4067 fail:
 4068         return err;
 4069 }
 4070 
 4071 int page_symlink(struct inode *inode, const char *symname, int len)
 4072 {
 4073         return __page_symlink(inode, symname, len,
 4074                         !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 4075 }
 4076 
 4077 const struct inode_operations page_symlink_inode_operations = {
 4078         .readlink       = generic_readlink,
 4079         .follow_link    = page_follow_link_light,
 4080         .put_link       = page_put_link,
 4081 };
 4082 
 4083 EXPORT_SYMBOL(user_path_at);
 4084 EXPORT_SYMBOL(follow_down_one);
 4085 EXPORT_SYMBOL(follow_down);
 4086 EXPORT_SYMBOL(follow_up);
 4087 EXPORT_SYMBOL(get_write_access); /* nfsd */
 4088 EXPORT_SYMBOL(lock_rename);
 4089 EXPORT_SYMBOL(lookup_one_len);
 4090 EXPORT_SYMBOL(page_follow_link_light);
 4091 EXPORT_SYMBOL(page_put_link);
 4092 EXPORT_SYMBOL(page_readlink);
 4093 EXPORT_SYMBOL(__page_symlink);
 4094 EXPORT_SYMBOL(page_symlink);
 4095 EXPORT_SYMBOL(page_symlink_inode_operations);
 4096 EXPORT_SYMBOL(kern_path);
 4097 EXPORT_SYMBOL(vfs_path_lookup);
 4098 EXPORT_SYMBOL(inode_permission);
 4099 EXPORT_SYMBOL(unlock_rename);
 4100 EXPORT_SYMBOL(vfs_create);
 4101 EXPORT_SYMBOL(vfs_follow_link);
 4102 EXPORT_SYMBOL(vfs_link);
 4103 EXPORT_SYMBOL(vfs_mkdir);
 4104 EXPORT_SYMBOL(vfs_mknod);
 4105 EXPORT_SYMBOL(generic_permission);
 4106 EXPORT_SYMBOL(vfs_readlink);
 4107 EXPORT_SYMBOL(vfs_rename);
 4108 EXPORT_SYMBOL(vfs_rmdir);
 4109 EXPORT_SYMBOL(vfs_symlink);
 4110 EXPORT_SYMBOL(vfs_unlink);
 4111 EXPORT_SYMBOL(dentry_unhash);
 4112 EXPORT_SYMBOL(generic_readlink);
Cache object: c083fe398e943c58c5cda74707aabecf
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/fs/namei.c

FreeBSD/Linux Kernel Cross Reference
sys/fs/namei.c