LCOV - landlock.info

LCOV - code coverage report

Current view:	top level - fs - namei.c (source / functions)		Hit	Total	Coverage
Test:	landlock.info	Lines:	1716	2314	74.2 %
Date:	2021-04-22 12:43:58	Functions:	117	157	74.5 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  linux/fs/namei.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992  Linus Torvalds
       6             :  */
       7             : 
       8             : /*
       9             :  * Some corrections by tytso.
      10             :  */
      11             : 
      12             : /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
      13             :  * lookup logic.
      14             :  */
      15             : /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
      16             :  */
      17             : 
      18             : #include <linux/init.h>
      19             : #include <linux/export.h>
      20             : #include <linux/kernel.h>
      21             : #include <linux/slab.h>
      22             : #include <linux/fs.h>
      23             : #include <linux/namei.h>
      24             : #include <linux/pagemap.h>
      25             : #include <linux/fsnotify.h>
      26             : #include <linux/personality.h>
      27             : #include <linux/security.h>
      28             : #include <linux/ima.h>
      29             : #include <linux/syscalls.h>
      30             : #include <linux/mount.h>
      31             : #include <linux/audit.h>
      32             : #include <linux/capability.h>
      33             : #include <linux/file.h>
      34             : #include <linux/fcntl.h>
      35             : #include <linux/device_cgroup.h>
      36             : #include <linux/fs_struct.h>
      37             : #include <linux/posix_acl.h>
      38             : #include <linux/hash.h>
      39             : #include <linux/bitops.h>
      40             : #include <linux/init_task.h>
      41             : #include <linux/uaccess.h>
      42             : 
      43             : #include "internal.h"
      44             : #include "mount.h"
      45             : 
      46             : /* [Feb-1997 T. Schoebel-Theuer]
      47             :  * Fundamental changes in the pathname lookup mechanisms (namei)
      48             :  * were necessary because of omirr.  The reason is that omirr needs
      49             :  * to know the _real_ pathname, not the user-supplied one, in case
      50             :  * of symlinks (and also when transname replacements occur).
      51             :  *
      52             :  * The new code replaces the old recursive symlink resolution with
      53             :  * an iterative one (in case of non-nested symlink chains).  It does
      54             :  * this with calls to <fs>_follow_link().
      55             :  * As a side effect, dir_namei(), _namei() and follow_link() are now 
      56             :  * replaced with a single function lookup_dentry() that can handle all 
      57             :  * the special cases of the former code.
      58             :  *
      59             :  * With the new dcache, the pathname is stored at each inode, at least as
      60             :  * long as the refcount of the inode is positive.  As a side effect, the
      61             :  * size of the dcache depends on the inode cache and thus is dynamic.
      62             :  *
      63             :  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
      64             :  * resolution to correspond with current state of the code.
      65             :  *
      66             :  * Note that the symlink resolution is not *completely* iterative.
      67             :  * There is still a significant amount of tail- and mid- recursion in
      68             :  * the algorithm.  Also, note that <fs>_readlink() is not used in
      69             :  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
      70             :  * may return different results than <fs>_follow_link().  Many virtual
      71             :  * filesystems (including /proc) exhibit this behavior.
      72             :  */
      73             : 
      74             : /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
      75             :  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
      76             :  * and the name already exists in form of a symlink, try to create the new
      77             :  * name indicated by the symlink. The old code always complained that the
      78             :  * name already exists, due to not following the symlink even if its target
      79             :  * is nonexistent.  The new semantics affects also mknod() and link() when
      80             :  * the name is a symlink pointing to a non-existent name.
      81             :  *
      82             :  * I don't know which semantics is the right one, since I have no access
      83             :  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
      84             :  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
      85             :  * "old" one. Personally, I think the new semantics is much more logical.
      86             :  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
      87             :  * file does succeed in both HP-UX and SunOs, but not in Solaris
      88             :  * and in the old Linux semantics.
      89             :  */
      90             : 
      91             : /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
      92             :  * semantics.  See the comments in "open_namei" and "do_link" below.
      93             :  *
      94             :  * [10-Sep-98 Alan Modra] Another symlink change.
      95             :  */
      96             : 
      97             : /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
      98             :  *      inside the path - always follow.
      99             :  *      in the last component in creation/removal/renaming - never follow.
     100             :  *      if LOOKUP_FOLLOW passed - follow.
     101             :  *      if the pathname has trailing slashes - follow.
     102             :  *      otherwise - don't follow.
     103             :  * (applied in that order).
     104             :  *
     105             :  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
     106             :  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
     107             :  * During the 2.4 we need to fix the userland stuff depending on it -
     108             :  * hopefully we will be able to get rid of that wart in 2.5. So far only
     109             :  * XEmacs seems to be relying on it...
     110             :  */
     111             : /*
     112             :  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
     113             :  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
     114             :  * any extra contention...
     115             :  */
     116             : 
     117             : /* In order to reduce some races, while at the same time doing additional
     118             :  * checking and hopefully speeding things up, we copy filenames to the
     119             :  * kernel data space before using them..
     120             :  *
     121             :  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
     122             :  * PATH_MAX includes the nul terminator --RR.
     123             :  */
     124             : 
     125             : #define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
     126             : 
     127             : struct filename *
     128       89142 : getname_flags(const char __user *filename, int flags, int *empty)
     129             : {
     130       89142 :         struct filename *result;
     131       89142 :         char *kname;
     132       89142 :         int len;
     133             : 
     134       89142 :         result = audit_reusename(filename);
     135       89142 :         if (result)
     136             :                 return result;
     137             : 
     138       89142 :         result = __getname();
     139       89223 :         if (unlikely(!result))
     140       89211 :                 return ERR_PTR(-ENOMEM);
     141             : 
     142             :         /*
     143             :          * First, try to embed the struct filename inside the names_cache
     144             :          * allocation
     145             :          */
     146       89223 :         kname = (char *)result->iname;
     147       89223 :         result->name = kname;
     148             : 
     149       89223 :         len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
     150       89211 :         if (unlikely(len < 0)) {
     151           0 :                 __putname(result);
     152           0 :                 return ERR_PTR(len);
     153             :         }
     154             : 
     155             :         /*
     156             :          * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
     157             :          * separate struct filename so we can dedicate the entire
     158             :          * names_cache allocation for the pathname, and re-do the copy from
     159             :          * userland.
     160             :          */
     161       89211 :         if (unlikely(len == EMBEDDED_NAME_MAX)) {
     162           0 :                 const size_t size = offsetof(struct filename, iname[1]);
     163           0 :                 kname = (char *)result;
     164             : 
     165             :                 /*
     166             :                  * size is chosen this way to guarantee that
     167             :                  * result->iname[0] is within the same object and that
     168             :                  * kname can't be equal to result->iname, no matter what.
     169             :                  */
     170           0 :                 result = kzalloc(size, GFP_KERNEL);
     171           0 :                 if (unlikely(!result)) {
     172           0 :                         __putname(kname);
     173           0 :                         return ERR_PTR(-ENOMEM);
     174             :                 }
     175           0 :                 result->name = kname;
     176           0 :                 len = strncpy_from_user(kname, filename, PATH_MAX);
     177           0 :                 if (unlikely(len < 0)) {
     178           0 :                         __putname(kname);
     179           0 :                         kfree(result);
     180           0 :                         return ERR_PTR(len);
     181             :                 }
     182           0 :                 if (unlikely(len == PATH_MAX)) {
     183           0 :                         __putname(kname);
     184           0 :                         kfree(result);
     185           0 :                         return ERR_PTR(-ENAMETOOLONG);
     186             :                 }
     187             :         }
     188             : 
     189       89211 :         result->refcnt = 1;
     190             :         /* The empty path is special. */
     191       89211 :         if (unlikely(!len)) {
     192          34 :                 if (empty)
     193           0 :                         *empty = 1;
     194          34 :                 if (!(flags & LOOKUP_EMPTY)) {
     195           0 :                         putname(result);
     196           0 :                         return ERR_PTR(-ENOENT);
     197             :                 }
     198             :         }
     199             : 
     200       89211 :         result->uptr = filename;
     201       89211 :         result->aname = NULL;
     202       89211 :         audit_getname(result);
     203       89211 :         return result;
     204             : }
     205             : 
     206             : struct filename *
     207       67038 : getname(const char __user * filename)
     208             : {
     209       59097 :         return getname_flags(filename, 0, NULL);
     210             : }
     211             : 
     212             : struct filename *
     213        1874 : getname_kernel(const char * filename)
     214             : {
     215        1874 :         struct filename *result;
     216        1874 :         int len = strlen(filename) + 1;
     217             : 
     218        1874 :         result = __getname();
     219        1874 :         if (unlikely(!result))
     220        1874 :                 return ERR_PTR(-ENOMEM);
     221             : 
     222        1874 :         if (len <= EMBEDDED_NAME_MAX) {
     223        1874 :                 result->name = (char *)result->iname;
     224           0 :         } else if (len <= PATH_MAX) {
     225           0 :                 const size_t size = offsetof(struct filename, iname[1]);
     226           0 :                 struct filename *tmp;
     227             : 
     228           0 :                 tmp = kmalloc(size, GFP_KERNEL);
     229           0 :                 if (unlikely(!tmp)) {
     230           0 :                         __putname(result);
     231           0 :                         return ERR_PTR(-ENOMEM);
     232             :                 }
     233           0 :                 tmp->name = (char *)result;
     234           0 :                 result = tmp;
     235             :         } else {
     236           0 :                 __putname(result);
     237           0 :                 return ERR_PTR(-ENAMETOOLONG);
     238             :         }
     239        1874 :         memcpy((char *)result->name, filename, len);
     240        1874 :         result->uptr = NULL;
     241        1874 :         result->aname = NULL;
     242        1874 :         result->refcnt = 1;
     243        1874 :         audit_getname(result);
     244             : 
     245        1874 :         return result;
     246             : }
     247             : 
     248       91078 : void putname(struct filename *name)
     249             : {
     250       91078 :         BUG_ON(name->refcnt <= 0);
     251             : 
     252       91078 :         if (--name->refcnt > 0)
     253             :                 return;
     254             : 
     255       91078 :         if (name->name != name->iname) {
     256           0 :                 __putname(name->name);
     257           0 :                 kfree(name);
     258             :         } else
     259       91078 :                 __putname(name);
     260             : }
     261             : 
     262             : /**
     263             :  * check_acl - perform ACL permission checking
     264             :  * @mnt_userns: user namespace of the mount the inode was found from
     265             :  * @inode:      inode to check permissions on
     266             :  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
     267             :  *
     268             :  * This function performs the ACL permission checking. Since this function
     269             :  * retrieves POSIX acls it needs to know whether it is called from a blocking or
     270             :  * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
     271             :  *
     272             :  * If the inode has been found through an idmapped mount the user namespace of
     273             :  * the vfsmount must be passed through @mnt_userns. This function will then take
     274             :  * care to map the inode according to @mnt_userns before checking permissions.
     275             :  * On non-idmapped mounts or if permission checking is to be performed on the
     276             :  * raw inode simply pass init_user_ns.
     277             :  */
     278             : static int check_acl(struct user_namespace *mnt_userns,
     279             :                      struct inode *inode, int mask)
     280             : {
     281             : #ifdef CONFIG_FS_POSIX_ACL
     282             :         struct posix_acl *acl;
     283             : 
     284             :         if (mask & MAY_NOT_BLOCK) {
     285             :                 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
     286             :                 if (!acl)
     287             :                         return -EAGAIN;
     288             :                 /* no ->get_acl() calls in RCU mode... */
     289             :                 if (is_uncached_acl(acl))
     290             :                         return -ECHILD;
     291             :                 return posix_acl_permission(mnt_userns, inode, acl, mask);
     292             :         }
     293             : 
     294             :         acl = get_acl(inode, ACL_TYPE_ACCESS);
     295             :         if (IS_ERR(acl))
     296             :                 return PTR_ERR(acl);
     297             :         if (acl) {
     298             :                 int error = posix_acl_permission(mnt_userns, inode, acl, mask);
     299             :                 posix_acl_release(acl);
     300             :                 return error;
     301             :         }
     302             : #endif
     303             : 
     304             :         return -EAGAIN;
     305             : }
     306             : 
     307             : /**
     308             :  * acl_permission_check - perform basic UNIX permission checking
     309             :  * @mnt_userns: user namespace of the mount the inode was found from
     310             :  * @inode:      inode to check permissions on
     311             :  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
     312             :  *
     313             :  * This function performs the basic UNIX permission checking. Since this
     314             :  * function may retrieve POSIX acls it needs to know whether it is called from a
     315             :  * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
     316             :  *
     317             :  * If the inode has been found through an idmapped mount the user namespace of
     318             :  * the vfsmount must be passed through @mnt_userns. This function will then take
     319             :  * care to map the inode according to @mnt_userns before checking permissions.
     320             :  * On non-idmapped mounts or if permission checking is to be performed on the
     321             :  * raw inode simply pass init_user_ns.
     322             :  */
     323      258181 : static int acl_permission_check(struct user_namespace *mnt_userns,
     324             :                                 struct inode *inode, int mask)
     325             : {
     326      258181 :         unsigned int mode = inode->i_mode;
     327      258181 :         kuid_t i_uid;
     328             : 
     329             :         /* Are we the owner? If so, ACL's don't matter */
     330      258181 :         i_uid = i_uid_into_mnt(mnt_userns, inode);
     331      258181 :         if (likely(uid_eq(current_fsuid(), i_uid))) {
     332      232907 :                 mask &= 7;
     333      232907 :                 mode >>= 6;
     334      232907 :                 return (mask & ~mode) ? -EACCES : 0;
     335             :         }
     336             : 
     337             :         /* Do we have ACL's? */
     338       25286 :         if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
     339       25286 :                 int error = check_acl(mnt_userns, inode, mask);
     340             :                 if (error != -EAGAIN)
     341             :                         return error;
     342             :         }
     343             : 
     344             :         /* Only RWX matters for group/other mode bits */
     345       25286 :         mask &= 7;
     346             : 
     347             :         /*
     348             :          * Are the group permissions different from
     349             :          * the other permissions in the bits we care
     350             :          * about? Need to check group ownership if so.
     351             :          */
     352       25286 :         if (mask & (mode ^ (mode >> 3))) {
     353          78 :                 kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
     354          78 :                 if (in_group_p(kgid))
     355           0 :                         mode >>= 3;
     356             :         }
     357             : 
     358             :         /* Bits in 'mode' clear that we require? */
     359       25286 :         return (mask & ~mode) ? -EACCES : 0;
     360             : }
     361             : 
     362             : /**
     363             :  * generic_permission -  check for access rights on a Posix-like filesystem
     364             :  * @mnt_userns: user namespace of the mount the inode was found from
     365             :  * @inode:      inode to check access rights for
     366             :  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
     367             :  *              %MAY_NOT_BLOCK ...)
     368             :  *
     369             :  * Used to check for read/write/execute permissions on a file.
     370             :  * We use "fsuid" for this, letting us set arbitrary permissions
     371             :  * for filesystem access without changing the "normal" uids which
     372             :  * are used for other things.
     373             :  *
     374             :  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
     375             :  * request cannot be satisfied (eg. requires blocking or too much complexity).
     376             :  * It would then be called again in ref-walk mode.
     377             :  *
     378             :  * If the inode has been found through an idmapped mount the user namespace of
     379             :  * the vfsmount must be passed through @mnt_userns. This function will then take
     380             :  * care to map the inode according to @mnt_userns before checking permissions.
     381             :  * On non-idmapped mounts or if permission checking is to be performed on the
     382             :  * raw inode simply pass init_user_ns.
     383             :  */
     384      258180 : int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
     385             :                        int mask)
     386             : {
     387      258180 :         int ret;
     388             : 
     389             :         /*
     390             :          * Do the basic permission checks.
     391             :          */
     392      258180 :         ret = acl_permission_check(mnt_userns, inode, mask);
     393      258193 :         if (ret != -EACCES)
     394             :                 return ret;
     395             : 
     396         209 :         if (S_ISDIR(inode->i_mode)) {
     397             :                 /* DACs are overridable for directories */
     398         139 :                 if (!(mask & MAY_WRITE))
     399          68 :                         if (capable_wrt_inode_uidgid(mnt_userns, inode,
     400             :                                                      CAP_DAC_READ_SEARCH))
     401             :                                 return 0;
     402          78 :                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
     403             :                                              CAP_DAC_OVERRIDE))
     404             :                         return 0;
     405           0 :                 return -EACCES;
     406             :         }
     407             : 
     408             :         /*
     409             :          * Searching includes executable on directories, else just read.
     410             :          */
     411          70 :         mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
     412          70 :         if (mask == MAY_READ)
     413          51 :                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
     414             :                                              CAP_DAC_READ_SEARCH))
     415             :                         return 0;
     416             :         /*
     417             :          * Read/write DACs are always overridable.
     418             :          * Executable DACs are overridable when there is
     419             :          * at least one exec bit set.
     420             :          */
     421          20 :         if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
     422          10 :                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
     423             :                                              CAP_DAC_OVERRIDE))
     424           3 :                         return 0;
     425             : 
     426             :         return -EACCES;
     427             : }
     428             : EXPORT_SYMBOL(generic_permission);
     429             : 
     430             : /**
     431             :  * do_inode_permission - UNIX permission checking
     432             :  * @mnt_userns: user namespace of the mount the inode was found from
     433             :  * @inode:      inode to check permissions on
     434             :  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
     435             :  *
     436             :  * We _really_ want to just do "generic_permission()" without
     437             :  * even looking at the inode->i_op values. So we keep a cache
     438             :  * flag in inode->i_opflags, that says "this has no special
     439             :  * permission function, use the fast case".
     440             :  */
     441      267836 : static inline int do_inode_permission(struct user_namespace *mnt_userns,
     442             :                                       struct inode *inode, int mask)
     443             : {
     444      267836 :         if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
     445       41528 :                 if (likely(inode->i_op->permission))
     446       37764 :                         return inode->i_op->permission(mnt_userns, inode, mask);
     447             : 
     448             :                 /* This gets set once for the inode lifetime */
     449        3764 :                 spin_lock(&inode->i_lock);
     450        3764 :                 inode->i_opflags |= IOP_FASTPERM;
     451        3764 :                 spin_unlock(&inode->i_lock);
     452             :         }
     453      230072 :         return generic_permission(mnt_userns, inode, mask);
     454             : }
     455             : 
     456             : /**
     457             :  * sb_permission - Check superblock-level permissions
     458             :  * @sb: Superblock of inode to check permission on
     459             :  * @inode: Inode to check permission on
     460             :  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
     461             :  *
     462             :  * Separate out file-system wide checks from inode-specific permission checks.
     463             :  */
     464      267841 : static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
     465             : {
     466      267841 :         if (unlikely(mask & MAY_WRITE)) {
     467        6495 :                 umode_t mode = inode->i_mode;
     468             : 
     469             :                 /* Nobody gets write access to a read-only fs. */
     470        6495 :                 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
     471           1 :                         return -EROFS;
     472             :         }
     473             :         return 0;
     474             : }
     475             : 
     476             : /**
     477             :  * inode_permission - Check for access rights to a given inode
     478             :  * @mnt_userns: User namespace of the mount the inode was found from
     479             :  * @inode:      Inode to check permission on
     480             :  * @mask:       Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
     481             :  *
     482             :  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
     483             :  * this, letting us set arbitrary permissions for filesystem access without
     484             :  * changing the "normal" UIDs which are used for other things.
     485             :  *
     486             :  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
     487             :  */
     488      267838 : int inode_permission(struct user_namespace *mnt_userns,
     489             :                      struct inode *inode, int mask)
     490             : {
     491      267838 :         int retval;
     492             : 
     493      267838 :         retval = sb_permission(inode->i_sb, inode, mask);
     494      267838 :         if (retval)
     495             :                 return retval;
     496             : 
     497      267837 :         if (unlikely(mask & MAY_WRITE)) {
     498             :                 /*
     499             :                  * Nobody gets write access to an immutable file.
     500             :                  */
     501        6494 :                 if (IS_IMMUTABLE(inode))
     502             :                         return -EPERM;
     503             : 
     504             :                 /*
     505             :                  * Updating mtime will likely cause i_uid and i_gid to be
     506             :                  * written back improperly if their true value is unknown
     507             :                  * to the vfs.
     508             :                  */
     509       12980 :                 if (HAS_UNMAPPED_ID(mnt_userns, inode))
     510             :                         return -EACCES;
     511             :         }
     512             : 
     513      267833 :         retval = do_inode_permission(mnt_userns, inode, mask);
     514      267843 :         if (retval)
     515             :                 return retval;
     516             : 
     517      258331 :         retval = devcgroup_inode_permission(inode, mask);
     518      258331 :         if (retval)
     519             :                 return retval;
     520             : 
     521      258331 :         return security_inode_permission(inode, mask);
     522             : }
     523             : EXPORT_SYMBOL(inode_permission);
     524             : 
     525             : /**
     526             :  * path_get - get a reference to a path
     527             :  * @path: path to get the reference to
     528             :  *
     529             :  * Given a path increment the reference count to the dentry and the vfsmount.
     530             :  */
     531       52519 : void path_get(const struct path *path)
     532             : {
     533       52519 :         mntget(path->mnt);
     534       52535 :         dget(path->dentry);
     535       52548 : }
     536             : EXPORT_SYMBOL(path_get);
     537             : 
     538             : /**
     539             :  * path_put - put a reference to a path
     540             :  * @path: path to put the reference to
     541             :  *
     542             :  * Given a path decrement the reference count to the dentry and the vfsmount.
     543             :  */
     544      153828 : void path_put(const struct path *path)
     545             : {
     546      153828 :         dput(path->dentry);
     547      153811 :         mntput(path->mnt);
     548      153824 : }
     549             : EXPORT_SYMBOL(path_put);
     550             : 
     551             : #define EMBEDDED_LEVELS 2
     552             : struct nameidata {
     553             :         struct path     path;
     554             :         struct qstr     last;
     555             :         struct path     root;
     556             :         struct inode    *inode; /* path.dentry.d_inode */
     557             :         unsigned int    flags;
     558             :         unsigned        seq, m_seq, r_seq;
     559             :         int             last_type;
     560             :         unsigned        depth;
     561             :         int             total_link_count;
     562             :         struct saved {
     563             :                 struct path link;
     564             :                 struct delayed_call done;
     565             :                 const char *name;
     566             :                 unsigned seq;
     567             :         } *stack, internal[EMBEDDED_LEVELS];
     568             :         struct filename *name;
     569             :         struct nameidata *saved;
     570             :         unsigned        root_seq;
     571             :         int             dfd;
     572             :         kuid_t          dir_uid;
     573             :         umode_t         dir_mode;
     574             : } __randomize_layout;
     575             : 
     576       90853 : static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
     577             : {
     578       90853 :         struct nameidata *old = current->nameidata;
     579       90853 :         p->stack = p->internal;
     580       90853 :         p->dfd = dfd;
     581       90853 :         p->name = name;
     582       90853 :         p->total_link_count = old ? old->total_link_count : 0;
     583       90853 :         p->saved = old;
     584       90853 :         current->nameidata = p;
     585             : }
     586             : 
     587       90859 : static void restore_nameidata(void)
     588             : {
     589       90859 :         struct nameidata *now = current->nameidata, *old = now->saved;
     590             : 
     591       90859 :         current->nameidata = old;
     592       90859 :         if (old)
     593           0 :                 old->total_link_count = now->total_link_count;
     594       90859 :         if (now->stack != now->internal)
     595           0 :                 kfree(now->stack);
     596       90859 : }
     597             : 
     598           0 : static bool nd_alloc_stack(struct nameidata *nd)
     599             : {
     600           0 :         struct saved *p;
     601             : 
     602           0 :         p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
     603           0 :                          nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
     604           0 :         if (unlikely(!p))
     605             :                 return false;
     606           0 :         memcpy(p, nd->internal, sizeof(nd->internal));
     607           0 :         nd->stack = p;
     608           0 :         return true;
     609             : }
     610             : 
     611             : /**
     612             :  * path_connected - Verify that a dentry is below mnt.mnt_root
     613             :  *
     614             :  * Rename can sometimes move a file or directory outside of a bind
     615             :  * mount, path_connected allows those cases to be detected.
     616             :  */
     617        1163 : static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
     618             : {
     619        1163 :         struct super_block *sb = mnt->mnt_sb;
     620             : 
     621             :         /* Bind mounts can have disconnected paths */
     622        1163 :         if (mnt->mnt_root == sb->s_root)
     623             :                 return true;
     624             : 
     625           0 :         return is_subdir(dentry, mnt->mnt_root);
     626             : }
     627             : 
     628       90934 : static void drop_links(struct nameidata *nd)
     629             : {
     630       90934 :         int i = nd->depth;
     631       90934 :         while (i--) {
     632          12 :                 struct saved *last = nd->stack + i;
     633          12 :                 do_delayed_call(&last->done);
     634       90952 :                 clear_delayed_call(&last->done);
     635             :         }
     636       90940 : }
     637             : 
     638       90933 : static void terminate_walk(struct nameidata *nd)
     639             : {
     640       90933 :         drop_links(nd);
     641       90943 :         if (!(nd->flags & LOOKUP_RCU)) {
     642       77578 :                 int i;
     643       77578 :                 path_put(&nd->path);
     644      155162 :                 for (i = 0; i < nd->depth; i++)
     645           5 :                         path_put(&nd->stack[i].link);
     646       77579 :                 if (nd->flags & LOOKUP_ROOT_GRABBED) {
     647       15701 :                         path_put(&nd->root);
     648       15701 :                         nd->flags &= ~LOOKUP_ROOT_GRABBED;
     649             :                 }
     650             :         } else {
     651       13365 :                 nd->flags &= ~LOOKUP_RCU;
     652       13365 :                 rcu_read_unlock();
     653             :         }
     654       90944 :         nd->depth = 0;
     655       90944 : }
     656             : 
     657             : /* path_put is needed afterwards regardless of success or failure */
     658       91013 : static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
     659             : {
     660       91013 :         int res = __legitimize_mnt(path->mnt, mseq);
     661       91030 :         if (unlikely(res)) {
     662          78 :                 if (res > 0)
     663          76 :                         path->mnt = NULL;
     664          78 :                 path->dentry = NULL;
     665          78 :                 return false;
     666             :         }
     667       90952 :         if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
     668           0 :                 path->dentry = NULL;
     669           0 :                 return false;
     670             :         }
     671       90948 :         return !read_seqcount_retry(&path->dentry->d_seq, seq);
     672             : }
     673             : 
     674       91018 : static inline bool legitimize_path(struct nameidata *nd,
     675             :                             struct path *path, unsigned seq)
     676             : {
     677       91018 :         return __legitimize_path(path, seq, nd->m_seq);
     678             : }
     679             : 
     680       77461 : static bool legitimize_links(struct nameidata *nd)
     681             : {
     682       77461 :         int i;
     683       77461 :         if (unlikely(nd->flags & LOOKUP_CACHED)) {
     684           0 :                 drop_links(nd);
     685           0 :                 nd->depth = 0;
     686           0 :                 return false;
     687             :         }
     688       78118 :         for (i = 0; i < nd->depth; i++) {
     689         665 :                 struct saved *last = nd->stack + i;
     690         665 :                 if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
     691           6 :                         drop_links(nd);
     692           6 :                         nd->depth = i + 1;
     693           6 :                         return false;
     694             :                 }
     695             :         }
     696             :         return true;
     697             : }
     698             : 
     699       77391 : static bool legitimize_root(struct nameidata *nd)
     700             : {
     701             :         /*
     702             :          * For scoped-lookups (where nd->root has been zeroed), we need to
     703             :          * restart the whole lookup from scratch -- because set_root() is wrong
     704             :          * for these lookups (nd->dfd is the root, not the filesystem root).
     705             :          */
     706       77391 :         if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
     707             :                 return false;
     708             :         /* Nothing to do if nd->root is zero or is managed by the VFS user. */
     709       77391 :         if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
     710             :                 return true;
     711       15045 :         nd->flags |= LOOKUP_ROOT_GRABBED;
     712       15045 :         return legitimize_path(nd, &nd->root, nd->root_seq);
     713             : }
     714             : 
     715             : /*
     716             :  * Path walking has 2 modes, rcu-walk and ref-walk (see
     717             :  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
     718             :  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
     719             :  * normal reference counts on dentries and vfsmounts to transition to ref-walk
     720             :  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
     721             :  * got stuck, so ref-walk may continue from there. If this is not successful
     722             :  * (eg. a seqcount has changed), then failure is returned and it's up to caller
     723             :  * to restart the path walk from the beginning in ref-walk mode.
     724             :  */
     725             : 
     726             : /**
     727             :  * try_to_unlazy - try to switch to ref-walk mode.
     728             :  * @nd: nameidata pathwalk data
     729             :  * Returns: true on success, false on failure
     730             :  *
     731             :  * try_to_unlazy attempts to legitimize the current nd->path and nd->root
     732             :  * for ref-walk mode.
     733             :  * Must be called from rcu-walk context.
     734             :  * Nothing should touch nameidata between try_to_unlazy() failure and
     735             :  * terminate_walk().
     736             :  */
     737       75311 : static bool try_to_unlazy(struct nameidata *nd)
     738             : {
     739       75311 :         struct dentry *parent = nd->path.dentry;
     740             : 
     741       75311 :         BUG_ON(!(nd->flags & LOOKUP_RCU));
     742             : 
     743       75311 :         nd->flags &= ~LOOKUP_RCU;
     744       75311 :         if (unlikely(!legitimize_links(nd)))
     745           0 :                 goto out1;
     746       75308 :         if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
     747          68 :                 goto out;
     748       75253 :         if (unlikely(!legitimize_root(nd)))
     749           1 :                 goto out;
     750       75249 :         rcu_read_unlock();
     751       75240 :         BUG_ON(nd->inode != parent->d_inode);
     752             :         return true;
     753             : 
     754           0 : out1:
     755           0 :         nd->path.mnt = NULL;
     756           0 :         nd->path.dentry = NULL;
     757          69 : out:
     758          69 :         rcu_read_unlock();
     759          69 :         return false;
     760             : }
     761             : 
     762             : /**
     763             :  * try_to_unlazy_next - try to switch to ref-walk mode.
     764             :  * @nd: nameidata pathwalk data
     765             :  * @dentry: next dentry to step into
     766             :  * @seq: seq number to check @dentry against
     767             :  * Returns: true on success, false on failure
     768             :  *
     769             :  * Similar to try_to_unlazy(), but here we have the next dentry already
     770             :  * picked by rcu-walk and want to legitimize that in addition to the current
     771             :  * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
     772             :  * Nothing should touch nameidata between try_to_unlazy_next() failure and
     773             :  * terminate_walk().
     774             :  */
     775        2154 : static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
     776             : {
     777        2154 :         BUG_ON(!(nd->flags & LOOKUP_RCU));
     778             : 
     779        2154 :         nd->flags &= ~LOOKUP_RCU;
     780        2154 :         if (unlikely(!legitimize_links(nd)))
     781           5 :                 goto out2;
     782        2147 :         if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
     783           7 :                 goto out2;
     784        2141 :         if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
     785           0 :                 goto out1;
     786             : 
     787             :         /*
     788             :          * We need to move both the parent and the dentry from the RCU domain
     789             :          * to be properly refcounted. And the sequence number in the dentry
     790             :          * validates *both* dentry counters, since we checked the sequence
     791             :          * number of the parent after we got the child sequence number. So we
     792             :          * know the parent must still be valid if the child sequence number is
     793             :          */
     794        2141 :         if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
     795           0 :                 goto out;
     796        2141 :         if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
     797           0 :                 goto out_dput;
     798             :         /*
     799             :          * Sequence counts matched. Now make sure that the root is
     800             :          * still valid and get it if required.
     801             :          */
     802        2141 :         if (unlikely(!legitimize_root(nd)))
     803           3 :                 goto out_dput;
     804        2138 :         rcu_read_unlock();
     805        2138 :         return true;
     806             : 
     807          12 : out2:
     808          12 :         nd->path.mnt = NULL;
     809          12 : out1:
     810          12 :         nd->path.dentry = NULL;
     811          12 : out:
     812          12 :         rcu_read_unlock();
     813          12 :         return false;
     814           3 : out_dput:
     815           3 :         rcu_read_unlock();
     816           3 :         dput(dentry);
     817           3 :         return false;
     818             : }
     819             : 
     820      208230 : static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
     821             : {
     822      208230 :         if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
     823       25947 :                 return dentry->d_op->d_revalidate(dentry, flags);
     824             :         else
     825             :                 return 1;
     826             : }
     827             : 
     828             : /**
     829             :  * complete_walk - successful completion of path walk
     830             :  * @nd:  pointer to nameidata
     831             :  *
     832             :  * If we had been in RCU mode, drop out of it and legitimize nd->path.
     833             :  * Revalidate the final result, unless we'd already done that during
     834             :  * the path walk or the filesystem doesn't ask for it.  Return 0 on
     835             :  * success, -error on failure.  In case of failure caller does not
     836             :  * need to drop nd->path.
     837             :  */
     838       68114 : static int complete_walk(struct nameidata *nd)
     839             : {
     840       68114 :         struct dentry *dentry = nd->path.dentry;
     841       68114 :         int status;
     842             : 
     843       68114 :         if (nd->flags & LOOKUP_RCU) {
     844             :                 /*
     845             :                  * We don't want to zero nd->root for scoped-lookups or
     846             :                  * externally-managed nd->root.
     847             :                  */
     848       53076 :                 if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
     849       53081 :                         nd->root.mnt = NULL;
     850       53076 :                 nd->flags &= ~LOOKUP_CACHED;
     851       53076 :                 if (!try_to_unlazy(nd))
     852             :                         return -ECHILD;
     853             :         }
     854             : 
     855       68059 :         if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
     856             :                 /*
     857             :                  * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
     858             :                  * ever step outside the root during lookup" and should already
     859             :                  * be guaranteed by the rest of namei, we want to avoid a namei
     860             :                  * BUG resulting in userspace being given a path that was not
     861             :                  * scoped within the root at some point during the lookup.
     862             :                  *
     863             :                  * So, do a final sanity-check to make sure that in the
     864             :                  * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
     865             :                  * we won't silently return an fd completely outside of the
     866             :                  * requested root to userspace.
     867             :                  *
     868             :                  * Userspace could move the path outside the root after this
     869             :                  * check, but as discussed elsewhere this is not a concern (the
     870             :                  * resolved file was inside the root at some point).
     871             :                  */
     872           0 :                 if (!path_is_under(&nd->path, &nd->root))
     873             :                         return -EXDEV;
     874             :         }
     875             : 
     876       68059 :         if (likely(!(nd->flags & LOOKUP_JUMPED)))
     877             :                 return 0;
     878             : 
     879       14338 :         if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
     880             :                 return 0;
     881             : 
     882           0 :         status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
     883           0 :         if (status > 0)
     884             :                 return 0;
     885             : 
     886           0 :         if (!status)
     887           0 :                 status = -ESTALE;
     888             : 
     889             :         return status;
     890             : }
     891             : 
     892       53107 : static int set_root(struct nameidata *nd)
     893             : {
     894       53107 :         struct fs_struct *fs = current->fs;
     895             : 
     896             :         /*
     897             :          * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
     898             :          * still have to ensure it doesn't happen because it will cause a breakout
     899             :          * from the dirfd.
     900             :          */
     901       53107 :         if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
     902             :                 return -ENOTRECOVERABLE;
     903             : 
     904       53107 :         if (nd->flags & LOOKUP_RCU) {
     905       52450 :                 unsigned seq;
     906             : 
     907       52450 :                 do {
     908       52450 :                         seq = read_seqcount_begin(&fs->seq);
     909       52451 :                         nd->root = fs->root;
     910       52451 :                         nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
     911       52455 :                 } while (read_seqcount_retry(&fs->seq, seq));
     912             :         } else {
     913         657 :                 get_fs_root(fs, &nd->root);
     914         657 :                 nd->flags |= LOOKUP_ROOT_GRABBED;
     915             :         }
     916             :         return 0;
     917             : }
     918             : 
     919       54910 : static int nd_jump_root(struct nameidata *nd)
     920             : {
     921       54910 :         if (unlikely(nd->flags & LOOKUP_BENEATH))
     922             :                 return -EXDEV;
     923       54910 :         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
     924             :                 /* Absolute path arguments to path_init() are allowed. */
     925           0 :                 if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
     926             :                         return -EXDEV;
     927             :         }
     928       54910 :         if (!nd->root.mnt) {
     929       52453 :                 int error = set_root(nd);
     930       52459 :                 if (error)
     931             :                         return error;
     932             :         }
     933       54916 :         if (nd->flags & LOOKUP_RCU) {
     934       54665 :                 struct dentry *d;
     935       54665 :                 nd->path = nd->root;
     936       54665 :                 d = nd->path.dentry;
     937       54665 :                 nd->inode = d->d_inode;
     938       54665 :                 nd->seq = nd->root_seq;
     939       54665 :                 if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
     940             :                         return -ECHILD;
     941             :         } else {
     942         251 :                 path_put(&nd->path);
     943         251 :                 nd->path = nd->root;
     944         251 :                 path_get(&nd->path);
     945         251 :                 nd->inode = nd->path.dentry->d_inode;
     946             :         }
     947       54913 :         nd->flags |= LOOKUP_JUMPED;
     948       54913 :         return 0;
     949             : }
     950             : 
     951             : /*
     952             :  * Helper to directly jump to a known parsed path from ->get_link,
     953             :  * caller must have taken a reference to path beforehand.
     954             :  */
     955         123 : int nd_jump_link(struct path *path)
     956             : {
     957         123 :         int error = -ELOOP;
     958         123 :         struct nameidata *nd = current->nameidata;
     959             : 
     960         123 :         if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
     961           0 :                 goto err;
     962             : 
     963         123 :         error = -EXDEV;
     964         123 :         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
     965           0 :                 if (nd->path.mnt != path->mnt)
     966           0 :                         goto err;
     967             :         }
     968             :         /* Not currently safe for scoped-lookups. */
     969         123 :         if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
     970           0 :                 goto err;
     971             : 
     972         123 :         path_put(&nd->path);
     973         123 :         nd->path = *path;
     974         123 :         nd->inode = nd->path.dentry->d_inode;
     975         123 :         nd->flags |= LOOKUP_JUMPED;
     976         123 :         return 0;
     977             : 
     978           0 : err:
     979           0 :         path_put(path);
     980           0 :         return error;
     981             : }
     982             : 
     983        8154 : static inline void put_link(struct nameidata *nd)
     984             : {
     985        8154 :         struct saved *last = nd->stack + --nd->depth;
     986        8154 :         do_delayed_call(&last->done);
     987        8152 :         if (!(nd->flags & LOOKUP_RCU))
     988        1859 :                 path_put(&last->link);
     989        8154 : }
     990             : 
     991             : int sysctl_protected_symlinks __read_mostly = 0;
     992             : int sysctl_protected_hardlinks __read_mostly = 0;
     993             : int sysctl_protected_fifos __read_mostly;
     994             : int sysctl_protected_regular __read_mostly;
     995             : 
     996             : /**
     997             :  * may_follow_link - Check symlink following for unsafe situations
     998             :  * @nd: nameidata pathwalk data
     999             :  *
    1000             :  * In the case of the sysctl_protected_symlinks sysctl being enabled,
    1001             :  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
    1002             :  * in a sticky world-writable directory. This is to protect privileged
    1003             :  * processes from failing races against path names that may change out
    1004             :  * from under them by way of other users creating malicious symlinks.
    1005             :  * It will permit symlinks to be followed only when outside a sticky
    1006             :  * world-writable directory, or when the uid of the symlink and follower
    1007             :  * match, or when the directory owner matches the symlink's owner.
    1008             :  *
    1009             :  * Returns 0 if following the symlink is allowed, -ve on error.
    1010             :  */
    1011        7118 : static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
    1012             : {
    1013        7118 :         struct user_namespace *mnt_userns;
    1014        7118 :         kuid_t i_uid;
    1015             : 
    1016        7118 :         if (!sysctl_protected_symlinks)
    1017             :                 return 0;
    1018             : 
    1019        6421 :         mnt_userns = mnt_user_ns(nd->path.mnt);
    1020        6421 :         i_uid = i_uid_into_mnt(mnt_userns, inode);
    1021             :         /* Allowed if owner and follower match. */
    1022        6421 :         if (uid_eq(current_cred()->fsuid, i_uid))
    1023             :                 return 0;
    1024             : 
    1025             :         /* Allowed if parent directory not sticky and world-writable. */
    1026         679 :         if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
    1027             :                 return 0;
    1028             : 
    1029             :         /* Allowed if parent directory and link owner match. */
    1030           0 :         if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
    1031             :                 return 0;
    1032             : 
    1033           0 :         if (nd->flags & LOOKUP_RCU)
    1034           0 :                 return -ECHILD;
    1035             : 
    1036        7118 :         audit_inode(nd->name, nd->stack[0].link.dentry, 0);
    1037        7118 :         audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
    1038             :         return -EACCES;
    1039             : }
    1040             : 
    1041             : /**
    1042             :  * safe_hardlink_source - Check for safe hardlink conditions
    1043             :  * @mnt_userns: user namespace of the mount the inode was found from
    1044             :  * @inode: the source inode to hardlink from
    1045             :  *
    1046             :  * Return false if at least one of the following conditions holds:
    1047             :  *    - inode is not a regular file
    1048             :  *    - inode is setuid
    1049             :  *    - inode is setgid and group-exec
    1050             :  *    - access failure for read and write
    1051             :  *
    1052             :  * Otherwise returns true.
    1053             :  */
    1054          26 : static bool safe_hardlink_source(struct user_namespace *mnt_userns,
    1055             :                                  struct inode *inode)
    1056             : {
    1057          26 :         umode_t mode = inode->i_mode;
    1058             : 
    1059             :         /* Special files should not get pinned to the filesystem. */
    1060          26 :         if (!S_ISREG(mode))
    1061             :                 return false;
    1062             : 
    1063             :         /* Setuid files should not get pinned to the filesystem. */
    1064          11 :         if (mode & S_ISUID)
    1065             :                 return false;
    1066             : 
    1067             :         /* Executable setgid files should not get pinned to the filesystem. */
    1068          11 :         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    1069             :                 return false;
    1070             : 
    1071             :         /* Hardlinking to unreadable or unwritable sources is dangerous. */
    1072          11 :         if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
    1073           6 :                 return false;
    1074             : 
    1075             :         return true;
    1076             : }
    1077             : 
    1078             : /**
    1079             :  * may_linkat - Check permissions for creating a hardlink
    1080             :  * @mnt_userns: user namespace of the mount the inode was found from
    1081             :  * @link: the source to hardlink from
    1082             :  *
    1083             :  * Block hardlink when all of:
    1084             :  *  - sysctl_protected_hardlinks enabled
    1085             :  *  - fsuid does not match inode
    1086             :  *  - hardlink source is unsafe (see safe_hardlink_source() above)
    1087             :  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
    1088             :  *
    1089             :  * If the inode has been found through an idmapped mount the user namespace of
    1090             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    1091             :  * care to map the inode according to @mnt_userns before checking permissions.
    1092             :  * On non-idmapped mounts or if permission checking is to be performed on the
    1093             :  * raw inode simply pass init_user_ns.
    1094             :  *
    1095             :  * Returns 0 if successful, -ve on error.
    1096             :  */
    1097          26 : int may_linkat(struct user_namespace *mnt_userns, struct path *link)
    1098             : {
    1099          26 :         struct inode *inode = link->dentry->d_inode;
    1100             : 
    1101             :         /* Inode writeback is not safe when the uid or gid are invalid. */
    1102          26 :         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
    1103          26 :             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
    1104             :                 return -EOVERFLOW;
    1105             : 
    1106          26 :         if (!sysctl_protected_hardlinks)
    1107             :                 return 0;
    1108             : 
    1109             :         /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
    1110             :          * otherwise, it must be a safe source.
    1111             :          */
    1112          47 :         if (safe_hardlink_source(mnt_userns, inode) ||
    1113          21 :             inode_owner_or_capable(mnt_userns, inode))
    1114          26 :                 return 0;
    1115             : 
    1116          26 :         audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
    1117             :         return -EPERM;
    1118             : }
    1119             : 
    1120             : /**
    1121             :  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
    1122             :  *                        should be allowed, or not, on files that already
    1123             :  *                        exist.
    1124             :  * @mnt_userns: user namespace of the mount the inode was found from
    1125             :  * @nd: nameidata of the lookup; its dir_mode and dir_uid fields supply
    1126             :  *      the mode bits and the owner of the directory
    1127             :  * @inode: the inode of the file to open
    1128             :  *
    1129             :  * Block an O_CREAT open of a FIFO (or a regular file) when:
    1130             :  *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
    1131             :  *   - the file already exists
    1132             :  *   - we are in a sticky directory
    1133             :  *   - we don't own the file
    1134             :  *   - the owner of the directory doesn't own the file
    1135             :  *   - the directory is world writable
    1136             :  * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
    1137             :  * the directory doesn't have to be world writable: being group writable will
    1138             :  * be enough.
    1139             :  *
    1140             :  * If the inode has been found through an idmapped mount the user namespace of
    1141             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    1142             :  * care to map the inode according to @mnt_userns before checking permissions.
    1143             :  * On non-idmapped mounts or if permission checking is to be performed on the
    1144             :  * raw inode simply pass init_user_ns.
    1145             :  *
    1146             :  * Returns 0 if the open is allowed, -ve on error.
    1147             :  */
    1148         738 : static int may_create_in_sticky(struct user_namespace *mnt_userns,
    1149             :                                 struct nameidata *nd, struct inode *const inode)
    1150             : {
    1151         738 :         umode_t dir_mode = nd->dir_mode;
    1152         738 :         kuid_t dir_uid = nd->dir_uid;
    1153             : 
    1154         738 :         if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
    1155         735 :             (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
    1156          73 :             likely(!(dir_mode & S_ISVTX)) ||
    1157           0 :             uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
    1158           0 :             uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
    1159         738 :                 return 0;
    1160             : 
    1161           0 :         if (likely(dir_mode & 0002) ||
    1162           0 :             (dir_mode & 0020 &&
    1163           0 :              ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
    1164           0 :               (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
    1165           0 :                 const char *operation = S_ISFIFO(inode->i_mode) ?
    1166             :                                         "sticky_create_fifo" :
    1167             :                                         "sticky_create_regular";
    1168           0 :                 audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
    1169           0 :                 return -EACCES;
    1170             :         }
    1171             :         return 0;
    1172             : }
    1173             : 
    1174             : /*
    1175             :  * follow_up - Find the mountpoint of path's vfsmount
    1176             :  *
    1177             :  * Given a path, find the mountpoint of its source file system.
    1178             :  * Replace @path with the path of the mountpoint in the parent mount.
    1179             :  * Up is towards /.
    1180             :  *
    1181             :  * Return 1 if we went up a level and 0 if we were already at the
    1182             :  * root.
    1183             :  */
    1184         632 : int follow_up(struct path *path)
    1185             : {
    1186         632 :         struct mount *mnt = real_mount(path->mnt);
    1187         632 :         struct mount *parent;
    1188         632 :         struct dentry *mountpoint;
    1189             : 
    1190         632 :         read_seqlock_excl(&mount_lock);
    1191         632 :         parent = mnt->mnt_parent;
    1192         632 :         if (parent == mnt) {
    1193         179 :                 read_sequnlock_excl(&mount_lock);
    1194         179 :                 return 0;
    1195             :         }
    1196         453 :         mntget(&parent->mnt);
    1197         453 :         mountpoint = dget(mnt->mnt_mountpoint);
    1198         453 :         read_sequnlock_excl(&mount_lock);
    1199         453 :         dput(path->dentry);
    1200         453 :         path->dentry = mountpoint;
    1201         453 :         mntput(path->mnt);
    1202         453 :         path->mnt = &parent->mnt;
    1203         453 :         return 1;
    1204             : }
    1205             : EXPORT_SYMBOL(follow_up);
    1206             : 
    1207           0 : static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
    1208             :                                   struct path *path, unsigned *seqp)
    1209             : {
    1210           0 :         while (mnt_has_parent(m)) {
    1211           0 :                 struct dentry *mountpoint = m->mnt_mountpoint;
    1212             : 
    1213           0 :                 m = m->mnt_parent;
    1214           0 :                 if (unlikely(root->dentry == mountpoint &&
    1215             :                              root->mnt == &m->mnt))
    1216             :                         break;
    1217           0 :                 if (mountpoint != m->mnt.mnt_root) {
    1218           0 :                         path->mnt = &m->mnt;
    1219           0 :                         path->dentry = mountpoint;
    1220           0 :                         *seqp = read_seqcount_begin(&mountpoint->d_seq);
    1221           0 :                         return true;
    1222             :                 }
    1223             :         }
    1224             :         return false;
    1225             : }
    1226             : 
    1227           0 : static bool choose_mountpoint(struct mount *m, const struct path *root,
    1228             :                               struct path *path)
    1229             : {
    1230           0 :         bool found;
    1231             : 
    1232           0 :         rcu_read_lock();
    1233           0 :         while (1) {
    1234           0 :                 unsigned seq, mseq = read_seqbegin(&mount_lock);
    1235             : 
    1236           0 :                 found = choose_mountpoint_rcu(m, root, path, &seq);
    1237           0 :                 if (unlikely(!found)) {
    1238           0 :                         if (!read_seqretry(&mount_lock, mseq))
    1239             :                                 break;
    1240             :                 } else {
    1241           0 :                         if (likely(__legitimize_path(path, seq, mseq)))
    1242             :                                 break;
    1243           0 :                         rcu_read_unlock();
    1244           0 :                         path_put(path);
    1245           0 :                         rcu_read_lock();
    1246             :                 }
    1247             :         }
    1248           0 :         rcu_read_unlock();
    1249           0 :         return found;
    1250             : }
    1251             : 
    1252             : /*
    1253             :  * Perform an automount
    1254             :  * - return -EISDIR to tell __traverse_mounts() to stop and return the
    1255             :  *   path we were called with.
    1256             :  */
    1257           0 : static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
    1258             : {
    1259           0 :         struct dentry *dentry = path->dentry;
    1260             : 
    1261             :         /* We don't want to mount if someone's just doing a stat -
    1262             :          * unless they're stat'ing a directory and appended a '/' to
    1263             :          * the name.
    1264             :          *
    1265             :          * We do, however, want to mount if someone wants to open or
    1266             :          * create a file of any type under the mountpoint, wants to
    1267             :          * traverse through the mountpoint or wants to open the
    1268             :          * mounted directory.  Also, autofs may mark negative dentries
    1269             :          * as being automount points.  These will need the attentions
    1270             :          * of the daemon to instantiate them before they can be used.
    1271             :          */
    1272           0 :         if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
    1273           0 :                            LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
    1274           0 :             dentry->d_inode)
    1275             :                 return -EISDIR;
    1276             : 
    1277           0 :         if (count && (*count)++ >= MAXSYMLINKS)
    1278             :                 return -ELOOP;
    1279             : 
    1280           0 :         return finish_automount(dentry->d_op->d_automount(path), path);
    1281             : }
    1282             : 
    1283             : /*
    1284             :  * mount traversal - out-of-line part.  One note on ->d_flags accesses -
    1285             :  * dentries are pinned but not locked here, so negative dentry can go
    1286             :  * positive right under us.  Use of smp_load_acquire() provides a barrier
    1287             :  * sufficient for ->d_inode and ->d_flags consistency.
    1288             :  */
    1289        2897 : static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
    1290             :                              int *count, unsigned lookup_flags)
    1291             : {
    1292        2897 :         struct vfsmount *mnt = path->mnt;
    1293        2897 :         bool need_mntput = false;
    1294        2897 :         int ret = 0;
    1295             : 
    1296        5741 :         while (flags & DCACHE_MANAGED_DENTRY) {
    1297             :                 /* Allow the filesystem to manage the transit without i_mutex
    1298             :                  * being held. */
    1299        2919 :                 if (flags & DCACHE_MANAGE_TRANSIT) {
    1300           0 :                         ret = path->dentry->d_op->d_manage(path, false);
    1301           0 :                         flags = smp_load_acquire(&path->dentry->d_flags);
    1302           0 :                         if (ret < 0)
    1303             :                                 break;
    1304             :                 }
    1305             : 
    1306        2919 :                 if (flags & DCACHE_MOUNTED) {       // something's mounted on it..
    1307        2919 :                         struct vfsmount *mounted = lookup_mnt(path);
    1308        2919 :                         if (mounted) {          // ... in our namespace
    1309        2844 :                                 dput(path->dentry);
    1310        2844 :                                 if (need_mntput)
    1311           0 :                                         mntput(path->mnt);
    1312        2844 :                                 path->mnt = mounted;
    1313        2844 :                                 path->dentry = dget(mounted->mnt_root);
    1314             :                                 // here we know it's positive
    1315        2844 :                                 flags = path->dentry->d_flags;
    1316        2844 :                                 need_mntput = true;
    1317        2844 :                                 continue;
    1318             :                         }
    1319             :                 }
    1320             : 
    1321          75 :                 if (!(flags & DCACHE_NEED_AUTOMOUNT))
    1322             :                         break;
    1323             : 
    1324             :                 // uncovered automount point
    1325           0 :                 ret = follow_automount(path, count, lookup_flags);
    1326           0 :                 flags = smp_load_acquire(&path->dentry->d_flags);
    1327           0 :                 if (ret < 0)
    1328             :                         break;
    1329             :         }
    1330             : 
    1331        2897 :         if (ret == -EISDIR)
    1332           0 :                 ret = 0;
    1333             :         // possible if you race with several mount --move
    1334        2897 :         if (need_mntput && path->mnt == mnt)
    1335           0 :                 mntput(path->mnt);
    1336        2897 :         if (!ret && unlikely(d_flags_negative(flags)))
    1337           0 :                 ret = -ENOENT;
    1338        2897 :         *jumped = need_mntput;
    1339        2897 :         return ret;
    1340             : }
    1341             : 
    1342       45207 : static inline int traverse_mounts(struct path *path, bool *jumped,
    1343             :                                   int *count, unsigned lookup_flags)
    1344             : {
    1345       45207 :         unsigned flags = smp_load_acquire(&path->dentry->d_flags);
    1346             : 
    1347             :         /* fastpath */
    1348       45209 :         if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
    1349       42312 :                 *jumped = false;
    1350       42312 :                 if (unlikely(d_flags_negative(flags)))
    1351             :                         return -ENOENT;
    1352       33771 :                 return 0;
    1353             :         }
    1354        2897 :         return __traverse_mounts(path, flags, jumped, count, lookup_flags);
    1355             : }
    1356             : 
    1357           0 : int follow_down_one(struct path *path)
    1358             : {
    1359           0 :         struct vfsmount *mounted;
    1360             : 
    1361           0 :         mounted = lookup_mnt(path);
    1362           0 :         if (mounted) {
    1363           0 :                 dput(path->dentry);
    1364           0 :                 mntput(path->mnt);
    1365           0 :                 path->mnt = mounted;
    1366           0 :                 path->dentry = dget(mounted->mnt_root);
    1367           0 :                 return 1;
    1368             :         }
    1369             :         return 0;
    1370             : }
    1371             : EXPORT_SYMBOL(follow_down_one);
    1372             : 
    1373             : /*
    1374             :  * Follow down to the covering mount currently visible to userspace.  At each
    1375             :  * point, the filesystem owning that dentry may be queried as to whether the
    1376             :  * caller is permitted to proceed or not.
    1377             :  */
    1378           0 : int follow_down(struct path *path)
    1379             : {
    1380           0 :         struct vfsmount *mnt = path->mnt;
    1381           0 :         bool jumped;
    1382           0 :         int ret = traverse_mounts(path, &jumped, NULL, 0);
    1383             : 
    1384           0 :         if (path->mnt != mnt)
    1385           0 :                 mntput(mnt);
    1386           0 :         return ret;
    1387             : }
    1388             : EXPORT_SYMBOL(follow_down);
    1389             : 
    1390             : /*
    1391             :  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
    1392             :  * we meet a managed dentry that would need blocking.
    1393             :  */
    1394      159460 : static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
    1395             :                                struct inode **inode, unsigned *seqp)
    1396             : {
    1397      159460 :         struct dentry *dentry = path->dentry;
    1398      159460 :         unsigned int flags = dentry->d_flags;
    1399             : 
    1400      159460 :         if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
    1401             :                 return true;
    1402             : 
    1403       29752 :         if (unlikely(nd->flags & LOOKUP_NO_XDEV))
    1404             :                 return false;
    1405             : 
    1406       80904 :         for (;;) {
    1407             :                 /*
    1408             :                  * Don't forget we might have a non-mountpoint managed dentry
    1409             :                  * that wants to block transit.
    1410             :                  */
    1411       55328 :                 if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
    1412           0 :                         int res = dentry->d_op->d_manage(path, true);
    1413           0 :                         if (res)
    1414           0 :                                 return res == -EISDIR;
    1415           0 :                         flags = dentry->d_flags;
    1416             :                 }
    1417             : 
    1418       55328 :                 if (flags & DCACHE_MOUNTED) {
    1419       30839 :                         struct mount *mounted = __lookup_mnt(path->mnt, dentry);
    1420       30841 :                         if (mounted) {
    1421       25577 :                                 path->mnt = &mounted->mnt;
    1422       25577 :                                 dentry = path->dentry = mounted->mnt.mnt_root;
    1423       25577 :                                 nd->flags |= LOOKUP_JUMPED;
    1424       25577 :                                 *seqp = read_seqcount_begin(&dentry->d_seq);
    1425       25576 :                                 *inode = dentry->d_inode;
    1426             :                                 /*
    1427             :                                  * We don't need to re-check ->d_seq after this
    1428             :                                  * ->d_inode read - there will be an RCU delay
    1429             :                                  * between mount hash removal and ->mnt_root
    1430             :                                  * becoming unpinned.
    1431             :                                  */
    1432       25576 :                                 flags = dentry->d_flags;
    1433       25576 :                                 continue;
    1434             :                         }
    1435        5264 :                         if (read_seqretry(&mount_lock, nd->m_seq))
    1436             :                                 return false;
    1437             :                 }
    1438       29750 :                 return !(flags & DCACHE_NEED_AUTOMOUNT);
    1439             :         }
    1440             : }
    1441             : 
    1442      218030 : static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
    1443             :                           struct path *path, struct inode **inode,
    1444             :                           unsigned int *seqp)
    1445             : {
    1446      218030 :         bool jumped;
    1447      218030 :         int ret;
    1448             : 
    1449      218030 :         path->mnt = nd->path.mnt;
    1450      218030 :         path->dentry = dentry;
    1451      218030 :         if (nd->flags & LOOKUP_RCU) {
    1452      172823 :                 unsigned int seq = *seqp;
    1453      172823 :                 if (unlikely(!*inode))
    1454             :                         return -ENOENT;
    1455      159458 :                 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
    1456             :                         return 0;
    1457           5 :                 if (!try_to_unlazy_next(nd, dentry, seq))
    1458             :                         return -ECHILD;
    1459             :                 // *path might've been clobbered by __follow_mount_rcu()
    1460           0 :                 path->mnt = nd->path.mnt;
    1461           0 :                 path->dentry = dentry;
    1462             :         }
    1463       45207 :         ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
    1464       45208 :         if (jumped) {
    1465        2844 :                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
    1466             :                         ret = -EXDEV;
    1467             :                 else
    1468        2844 :                         nd->flags |= LOOKUP_JUMPED;
    1469             :         }
    1470       45208 :         if (unlikely(ret)) {
    1471        8540 :                 dput(path->dentry);
    1472        8539 :                 if (path->mnt != nd->path.mnt)
    1473           0 :                         mntput(path->mnt);
    1474             :         } else {
    1475       36668 :                 *inode = d_backing_inode(path->dentry);
    1476       36668 :                 *seqp = 0; /* out of RCU mode, so the value doesn't matter */
    1477             :         }
    1478             :         return ret;
    1479             : }
    1480             : 
    1481             : /*
    1482             :  * This looks up the name in dcache and possibly revalidates the found dentry.
    1483             :  * NULL is returned if the dentry does not exist in the cache.
    1484             :  */
    1485       12678 : static struct dentry *lookup_dcache(const struct qstr *name,
    1486             :                                     struct dentry *dir,
    1487             :                                     unsigned int flags)
    1488             : {
    1489       12678 :         struct dentry *dentry = d_lookup(dir, name);
    1490       12678 :         if (dentry) {
    1491        5056 :                 int error = d_revalidate(dentry, flags);
    1492        5056 :                 if (unlikely(error <= 0)) {
    1493           0 :                         if (!error)
    1494           0 :                                 d_invalidate(dentry);
    1495           0 :                         dput(dentry);
    1496           0 :                         return ERR_PTR(error);
    1497             :                 }
    1498             :         }
    1499             :         return dentry;
    1500             : }
    1501             : 
    1502             : /*
    1503             :  * Parent directory has inode locked exclusive.  This is the one
    1504             :  * and only case when ->lookup() gets called on non in-lookup
    1505             :  * dentries - as a matter of fact, this only gets called
    1506             :  * when the directory is guaranteed to have no in-lookup children
    1507             :  * at all.
    1508             :  */
    1509        7115 : static struct dentry *__lookup_hash(const struct qstr *name,
    1510             :                 struct dentry *base, unsigned int flags)
    1511             : {
    1512        7115 :         struct dentry *dentry = lookup_dcache(name, base, flags);
    1513        7115 :         struct dentry *old;
    1514        7115 :         struct inode *dir = base->d_inode;
    1515             : 
    1516        7115 :         if (dentry)
    1517             :                 return dentry;
    1518             : 
    1519             :         /* Don't create child dentry for a dead directory. */
    1520        2085 :         if (unlikely(IS_DEADDIR(dir)))
    1521        7115 :                 return ERR_PTR(-ENOENT);
    1522             : 
    1523        2085 :         dentry = d_alloc(base, name);
    1524        2085 :         if (unlikely(!dentry))
    1525        7115 :                 return ERR_PTR(-ENOMEM);
    1526             : 
    1527        2085 :         old = dir->i_op->lookup(dir, dentry, flags);
    1528        2085 :         if (unlikely(old)) {
    1529           0 :                 dput(dentry);
    1530           0 :                 dentry = old;
    1531             :         }
    1532             :         return dentry;
    1533             : }
    1534             : 
    1535      216893 : static struct dentry *lookup_fast(struct nameidata *nd,
    1536             :                                   struct inode **inode,
    1537             :                                   unsigned *seqp)
    1538             : {
    1539      216893 :         struct dentry *dentry, *parent = nd->path.dentry;
    1540      216893 :         int status = 1;
    1541             : 
    1542             :         /*
    1543             :          * Rename seqlock is not required here because in the off chance
    1544             :          * of a false negative due to a concurrent rename, the caller is
    1545             :          * going to fall back to non-racy lookup.
    1546             :          */
    1547      216893 :         if (nd->flags & LOOKUP_RCU) {
    1548      186821 :                 unsigned seq;
    1549      186821 :                 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
    1550      186856 :                 if (unlikely(!dentry)) {
    1551       11984 :                         if (!try_to_unlazy(nd))
    1552      184716 :                                 return ERR_PTR(-ECHILD);
    1553       11976 :                         return NULL;
    1554             :                 }
    1555             : 
    1556             :                 /*
    1557             :                  * This sequence count validates that the inode matches
    1558             :                  * the dentry name information from lookup.
    1559             :                  */
    1560      174872 :                 *inode = d_backing_inode(dentry);
    1561      174872 :                 if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
    1562      184716 :                         return ERR_PTR(-ECHILD);
    1563             : 
    1564             :                 /*
    1565             :                  * This sequence count validates that the parent had no
    1566             :                  * changes while we did the lookup of the dentry above.
    1567             :                  *
    1568             :                  * The memory barrier in read_seqcount_begin of child is
    1569             :                  *  enough, we can use __read_seqcount_retry here.
    1570             :                  */
    1571      174867 :                 if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
    1572      184716 :                         return ERR_PTR(-ECHILD);
    1573             : 
    1574      174867 :                 *seqp = seq;
    1575      174867 :                 status = d_revalidate(dentry, nd->flags);
    1576      174872 :                 if (likely(status > 0))
    1577             :                         return dentry;
    1578        2149 :                 if (!try_to_unlazy_next(nd, dentry, seq))
    1579      184716 :                         return ERR_PTR(-ECHILD);
    1580        2138 :                 if (status == -ECHILD)
    1581             :                         /* we'd been told to redo it in non-rcu mode */
    1582        2138 :                         status = d_revalidate(dentry, nd->flags);
    1583             :         } else {
    1584       30072 :                 dentry = __d_lookup(parent, &nd->last);
    1585       30071 :                 if (unlikely(!dentry))
    1586             :                         return NULL;
    1587       26066 :                 status = d_revalidate(dentry, nd->flags);
    1588             :         }
    1589       28206 :         if (unlikely(status <= 0)) {
    1590           0 :                 if (!status)
    1591           0 :                         d_invalidate(dentry);
    1592           0 :                 dput(dentry);
    1593           0 :                 return ERR_PTR(status);
    1594             :         }
    1595             :         return dentry;
    1596             : }
    1597             : 
    1598             : /* Fast lookup failed, do it the slow way */
    1599       18799 : static struct dentry *__lookup_slow(const struct qstr *name,
    1600             :                                     struct dentry *dir,
    1601             :                                     unsigned int flags)
    1602             : {
    1603       18799 :         struct dentry *dentry, *old;
    1604       18799 :         struct inode *inode = dir->d_inode;
    1605       18799 :         DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
    1606             : 
    1607             :         /* Don't go there if it's already dead */
    1608       18799 :         if (unlikely(IS_DEADDIR(inode)))
    1609       18799 :                 return ERR_PTR(-ENOENT);
    1610       18799 : again:
    1611       18799 :         dentry = d_alloc_parallel(dir, name, &wq);
    1612       18798 :         if (IS_ERR(dentry))
    1613           0 :                 return dentry;
    1614       18798 :         if (unlikely(!d_in_lookup(dentry))) {
    1615           4 :                 int error = d_revalidate(dentry, flags);
    1616           4 :                 if (unlikely(error <= 0)) {
    1617           0 :                         if (!error) {
    1618           0 :                                 d_invalidate(dentry);
    1619           0 :                                 dput(dentry);
    1620           0 :                                 goto again;
    1621             :                         }
    1622           0 :                         dput(dentry);
    1623           0 :                         dentry = ERR_PTR(error);
    1624             :                 }
    1625             :         } else {
    1626       18794 :                 old = inode->i_op->lookup(inode, dentry, flags);
    1627       18795 :                 d_lookup_done(dentry);
    1628       18795 :                 if (unlikely(old)) {
    1629         113 :                         dput(dentry);
    1630         113 :                         dentry = old;
    1631             :                 }
    1632             :         }
    1633             :         return dentry;
    1634             : }
    1635             : 
    1636       13269 : static struct dentry *lookup_slow(const struct qstr *name,
    1637             :                                   struct dentry *dir,
    1638             :                                   unsigned int flags)
    1639             : {
    1640       13269 :         struct inode *inode = dir->d_inode;
    1641       13269 :         struct dentry *res;
    1642       13269 :         inode_lock_shared(inode);
    1643       13270 :         res = __lookup_slow(name, dir, flags);
    1644       13269 :         inode_unlock_shared(inode);
    1645       13270 :         return res;
    1646             : }
    1647             : 
    1648      226338 : static inline int may_lookup(struct user_namespace *mnt_userns,
    1649             :                              struct nameidata *nd)
    1650             : {
    1651      226338 :         if (nd->flags & LOOKUP_RCU) {
    1652      204537 :                 int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
    1653      204539 :                 if (err != -ECHILD || !try_to_unlazy(nd))
    1654      195049 :                         return err;
    1655             :         }
    1656       31291 :         return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
    1657             : }
    1658             : 
    1659        8159 : static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
    1660             : {
    1661        8159 :         if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
    1662             :                 return -ELOOP;
    1663             : 
    1664        8159 :         if (likely(nd->depth != EMBEDDED_LEVELS))
    1665             :                 return 0;
    1666           0 :         if (likely(nd->stack != nd->internal))
    1667             :                 return 0;
    1668           0 :         if (likely(nd_alloc_stack(nd)))
    1669             :                 return 0;
    1670             : 
    1671           0 :         if (nd->flags & LOOKUP_RCU) {
    1672             :                 // we need to grab link before we do unlazy.  And we can't skip
    1673             :                 // unlazy even if we fail to grab the link - cleanup needs it
    1674           0 :                 bool grabbed_link = legitimize_path(nd, link, seq);
    1675             : 
    1676           0 :                 if (!try_to_unlazy(nd) != 0 || !grabbed_link)
    1677             :                         return -ECHILD;
    1678             : 
    1679           0 :                 if (nd_alloc_stack(nd))
    1680           0 :                         return 0;
    1681             :         }
    1682             :         return -ENOMEM;
    1683             : }
    1684             : 
    1685             : enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
    1686             : 
    1687        8159 : static const char *pick_link(struct nameidata *nd, struct path *link,
    1688             :                      struct inode *inode, unsigned seq, int flags)
    1689             : {
    1690        8159 :         struct saved *last;
    1691        8159 :         const char *res;
    1692        8159 :         int error = reserve_stack(nd, link, seq);
    1693             : 
    1694        8159 :         if (unlikely(error)) {
    1695           0 :                 if (!(nd->flags & LOOKUP_RCU))
    1696           0 :                         path_put(link);
    1697           0 :                 return ERR_PTR(error);
    1698             :         }
    1699        8159 :         last = nd->stack + nd->depth++;
    1700        8159 :         last->link = *link;
    1701        8159 :         clear_delayed_call(&last->done);
    1702        8159 :         last->seq = seq;
    1703             : 
    1704        8159 :         if (flags & WALK_TRAILING) {
    1705        7118 :                 error = may_follow_link(nd, inode);
    1706        7118 :                 if (unlikely(error))
    1707           0 :                         return ERR_PTR(error);
    1708             :         }
    1709             : 
    1710        8159 :         if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
    1711        8159 :                         unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
    1712        8161 :                 return ERR_PTR(-ELOOP);
    1713             : 
    1714        8159 :         if (!(nd->flags & LOOKUP_RCU)) {
    1715        1205 :                 touch_atime(&last->link);
    1716        1205 :                 cond_resched();
    1717        6954 :         } else if (atime_needs_update(&last->link, inode)) {
    1718          20 :                 if (!try_to_unlazy(nd))
    1719        8161 :                         return ERR_PTR(-ECHILD);
    1720          20 :                 touch_atime(&last->link);
    1721             :         }
    1722             : 
    1723       16316 :         error = security_inode_follow_link(link->dentry, inode,
    1724        8159 :                                            nd->flags & LOOKUP_RCU);
    1725        8157 :         if (unlikely(error))
    1726           0 :                 return ERR_PTR(error);
    1727             : 
    1728        8157 :         res = READ_ONCE(inode->i_link);
    1729        8157 :         if (!res) {
    1730        1022 :                 const char * (*get)(struct dentry *, struct inode *,
    1731             :                                 struct delayed_call *);
    1732        1022 :                 get = inode->i_op->get_link;
    1733        1022 :                 if (nd->flags & LOOKUP_RCU) {
    1734         639 :                         res = get(NULL, inode, &last->done);
    1735         642 :                         if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
    1736           0 :                                 res = get(link->dentry, inode, &last->done);
    1737             :                 } else {
    1738         383 :                         res = get(link->dentry, inode, &last->done);
    1739             :                 }
    1740        1026 :                 if (!res)
    1741         123 :                         goto all_done;
    1742         903 :                 if (IS_ERR(res))
    1743             :                         return res;
    1744             :         }
    1745        8038 :         if (*res == '/') {
    1746        2457 :                 error = nd_jump_root(nd);
    1747        2457 :                 if (unlikely(error))
    1748           0 :                         return ERR_PTR(error);
    1749        2457 :                 while (unlikely(*++res == '/'))
    1750        2457 :                         ;
    1751             :         }
    1752        8038 :         if (*res)
    1753             :                 return res;
    1754           0 : all_done: // pure jump
    1755         123 :         put_link(nd);
    1756         123 :         return NULL;
    1757             : }
    1758             : 
    1759             : /*
    1760             :  * Do we need to follow links? We _really_ want to be able
    1761             :  * to do this check without having to look at inode->i_op,
    1762             :  * so we keep a cache of "no, this doesn't need follow_link"
    1763             :  * for the common case.
    1764             :  */
    1765      218022 : static const char *step_into(struct nameidata *nd, int flags,
    1766             :                      struct dentry *dentry, struct inode *inode, unsigned seq)
    1767             : {
    1768      218022 :         struct path path;
    1769      218022 :         int err = handle_mounts(nd, dentry, &path, &inode, &seq);
    1770             : 
    1771      218022 :         if (err < 0)
    1772       21909 :                 return ERR_PTR(err);
    1773      196113 :         if (likely(!d_is_symlink(path.dentry)) ||
    1774        9969 :            ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
    1775        8159 :            (flags & WALK_NOFOLLOW)) {
    1776             :                 /* not a symlink or should not follow */
    1777      187954 :                 if (!(nd->flags & LOOKUP_RCU)) {
    1778       35463 :                         dput(nd->path.dentry);
    1779       35461 :                         if (nd->path.mnt != path.mnt)
    1780        2844 :                                 mntput(nd->path.mnt);
    1781             :                 }
    1782      187952 :                 nd->path = path;
    1783      187952 :                 nd->inode = inode;
    1784      187952 :                 nd->seq = seq;
    1785      187952 :                 return NULL;
    1786             :         }
    1787        8159 :         if (nd->flags & LOOKUP_RCU) {
    1788             :                 /* make sure that d_is_symlink above matches inode */
    1789        6954 :                 if (read_seqcount_retry(&path.dentry->d_seq, seq))
    1790      218021 :                         return ERR_PTR(-ECHILD);
    1791             :         } else {
    1792        1205 :                 if (path.mnt == nd->path.mnt)
    1793        1205 :                         mntget(path.mnt);
    1794             :         }
    1795        8161 :         return pick_link(nd, &path, inode, seq, flags);
    1796             : }
    1797             : 
    1798         107 : static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
    1799             :                                         struct inode **inodep,
    1800             :                                         unsigned *seqp)
    1801             : {
    1802         107 :         struct dentry *parent, *old;
    1803             : 
    1804         107 :         if (path_equal(&nd->path, &nd->root))
    1805           4 :                 goto in_root;
    1806         103 :         if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
    1807           0 :                 struct path path;
    1808           0 :                 unsigned seq;
    1809           0 :                 if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
    1810           0 :                                            &nd->root, &path, &seq))
    1811           0 :                         goto in_root;
    1812           0 :                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
    1813           0 :                         return ERR_PTR(-ECHILD);
    1814           0 :                 nd->path = path;
    1815           0 :                 nd->inode = path.dentry->d_inode;
    1816           0 :                 nd->seq = seq;
    1817           0 :                 if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
    1818           0 :                         return ERR_PTR(-ECHILD);
    1819             :                 /* we know that mountpoint was pinned */
    1820             :         }
    1821         103 :         old = nd->path.dentry;
    1822         103 :         parent = old->d_parent;
    1823         103 :         *inodep = parent->d_inode;
    1824         103 :         *seqp = read_seqcount_begin(&parent->d_seq);
    1825         103 :         if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
    1826         107 :                 return ERR_PTR(-ECHILD);
    1827         103 :         if (unlikely(!path_connected(nd->path.mnt, parent)))
    1828           0 :                 return ERR_PTR(-ECHILD);
    1829             :         return parent;
    1830           4 : in_root:
    1831           4 :         if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
    1832         107 :                 return ERR_PTR(-ECHILD);
    1833           4 :         if (unlikely(nd->flags & LOOKUP_BENEATH))
    1834           0 :                 return ERR_PTR(-ECHILD);
    1835             :         return NULL;
    1836             : }
    1837             : 
    1838        1060 : static struct dentry *follow_dotdot(struct nameidata *nd,
    1839             :                                  struct inode **inodep,
    1840             :                                  unsigned *seqp)
    1841             : {
    1842        1060 :         struct dentry *parent;
    1843             : 
    1844        1060 :         if (path_equal(&nd->path, &nd->root))
    1845           0 :                 goto in_root;
    1846        1060 :         if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
    1847           0 :                 struct path path;
    1848             : 
    1849           0 :                 if (!choose_mountpoint(real_mount(nd->path.mnt),
    1850           0 :                                        &nd->root, &path))
    1851           0 :                         goto in_root;
    1852           0 :                 path_put(&nd->path);
    1853           0 :                 nd->path = path;
    1854           0 :                 nd->inode = path.dentry->d_inode;
    1855           0 :                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
    1856           0 :                         return ERR_PTR(-EXDEV);
    1857             :         }
    1858             :         /* rare case of legitimate dget_parent()... */
    1859        1060 :         parent = dget_parent(nd->path.dentry);
    1860        1060 :         if (unlikely(!path_connected(nd->path.mnt, parent))) {
    1861           0 :                 dput(parent);
    1862           0 :                 return ERR_PTR(-ENOENT);
    1863             :         }
    1864        1060 :         *seqp = 0;
    1865        1060 :         *inodep = parent->d_inode;
    1866        1060 :         return parent;
    1867             : 
    1868           0 : in_root:
    1869           0 :         if (unlikely(nd->flags & LOOKUP_BENEATH))
    1870        1060 :                 return ERR_PTR(-EXDEV);
    1871           0 :         dget(nd->path.dentry);
    1872             :         return NULL;
    1873             : }
    1874             : 
    1875        8858 : static const char *handle_dots(struct nameidata *nd, int type)
    1876             : {
    1877        8858 :         if (type == LAST_DOTDOT) {
    1878        1167 :                 const char *error = NULL;
    1879        1167 :                 struct dentry *parent;
    1880        1167 :                 struct inode *inode;
    1881        1167 :                 unsigned seq;
    1882             : 
    1883        1167 :                 if (!nd->root.mnt) {
    1884         653 :                         error = ERR_PTR(set_root(nd));
    1885         653 :                         if (error)
    1886           0 :                                 return error;
    1887             :                 }
    1888        1167 :                 if (nd->flags & LOOKUP_RCU)
    1889         107 :                         parent = follow_dotdot_rcu(nd, &inode, &seq);
    1890             :                 else
    1891        1060 :                         parent = follow_dotdot(nd, &inode, &seq);
    1892        1167 :                 if (IS_ERR(parent))
    1893           0 :                         return ERR_CAST(parent);
    1894        1167 :                 if (unlikely(!parent))
    1895           4 :                         error = step_into(nd, WALK_NOFOLLOW,
    1896             :                                          nd->path.dentry, nd->inode, nd->seq);
    1897             :                 else
    1898        1163 :                         error = step_into(nd, WALK_NOFOLLOW,
    1899             :                                          parent, inode, seq);
    1900        1167 :                 if (unlikely(error))
    1901             :                         return error;
    1902             : 
    1903        1167 :                 if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
    1904             :                         /*
    1905             :                          * If there was a racing rename or mount along our
    1906             :                          * path, then we can't be sure that ".." hasn't jumped
    1907             :                          * above nd->root (and so userspace should retry or use
    1908             :                          * some fallback).
    1909             :                          */
    1910           0 :                         smp_rmb();
    1911           0 :                         if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
    1912           0 :                                 return ERR_PTR(-EAGAIN);
    1913           0 :                         if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
    1914           0 :                                 return ERR_PTR(-EAGAIN);
    1915             :                 }
    1916             :         }
    1917             :         return NULL;
    1918             : }
    1919             : 
    1920      201314 : static const char *walk_component(struct nameidata *nd, int flags)
    1921             : {
    1922      201314 :         struct dentry *dentry;
    1923      201314 :         struct inode *inode;
    1924      201314 :         unsigned seq;
    1925             :         /*
    1926             :          * "." and ".." are special - ".." especially so because it has
    1927             :          * to be able to know about the current root directory and
    1928             :          * parent relationships.
    1929             :          */
    1930      201314 :         if (unlikely(nd->last_type != LAST_NORM)) {
    1931        8821 :                 if (!(flags & WALK_MORE) && nd->depth)
    1932           0 :                         put_link(nd);
    1933        8821 :                 return handle_dots(nd, nd->last_type);
    1934             :         }
    1935      192493 :         dentry = lookup_fast(nd, &inode, &seq);
    1936      192521 :         if (IS_ERR(dentry))
    1937      201318 :                 return ERR_CAST(dentry);
    1938      192506 :         if (unlikely(!dentry)) {
    1939       13261 :                 dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
    1940       13261 :                 if (IS_ERR(dentry))
    1941      201318 :                         return ERR_CAST(dentry);
    1942             :         }
    1943      192393 :         if (!(flags & WALK_MORE) && nd->depth)
    1944        2360 :                 put_link(nd);
    1945      192393 :         return step_into(nd, flags, dentry, inode, seq);
    1946             : }
    1947             : 
    1948             : /*
    1949             :  * We can do the critical dentry name comparison and hashing
    1950             :  * operations one word at a time, but we are limited to:
    1951             :  *
    1952             :  * - Architectures with fast unaligned word accesses. We could
    1953             :  *   do a "get_unaligned()" if this helps and is sufficiently
    1954             :  *   fast.
    1955             :  *
    1956             :  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
    1957             :  *   do not trap on the (extremely unlikely) case of a page
    1958             :  *   crossing operation).
    1959             :  *
    1960             :  * - Furthermore, we need an efficient 64-bit compile for the
    1961             :  *   64-bit case in order to generate the "number of bytes in
    1962             :  *   the final mask". Again, that could be replaced with an
    1963             :  *   efficient population count instruction or similar.
    1964             :  */
    1965             : #ifdef CONFIG_DCACHE_WORD_ACCESS
    1966             : 
    1967             : #include <asm/word-at-a-time.h>
    1968             : 
    1969             : #ifdef HASH_MIX
    1970             : 
    1971             : /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
    1972             : 
    1973             : #elif defined(CONFIG_64BIT)
    1974             : /*
    1975             :  * Register pressure in the mixing function is an issue, particularly
    1976             :  * on 32-bit x86, but almost any function requires one state value and
    1977             :  * one temporary.  Instead, use a function designed for two state values
    1978             :  * and no temporaries.
    1979             :  *
    1980             :  * This function cannot create a collision in only two iterations, so
    1981             :  * we have two iterations to achieve avalanche.  In those two iterations,
    1982             :  * we have six layers of mixing, which is enough to spread one bit's
    1983             :  * influence out to 2^6 = 64 state bits.
    1984             :  *
    1985             :  * Rotate constants are scored by considering either 64 one-bit input
    1986             :  * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
    1987             :  * probability of that delta causing a change to each of the 128 output
    1988             :  * bits, using a sample of random initial states.
    1989             :  *
    1990             :  * The Shannon entropy of the computed probabilities is then summed
    1991             :  * to produce a score.  Ideally, any input change has a 50% chance of
    1992             :  * toggling any given output bit.
    1993             :  *
    1994             :  * Mixing scores (in bits) for (12,45):
    1995             :  * Input delta: 1-bit      2-bit
    1996             :  * 1 round:     713.3    42542.6
    1997             :  * 2 rounds:   2753.7   140389.8
    1998             :  * 3 rounds:   5954.1   233458.2
    1999             :  * 4 rounds:   7862.6   256672.2
    2000             :  * Perfect:    8192     258048
    2001             :  *            (64*128) (64*63/2 * 128)
    2002             :  */
    2003             : #define HASH_MIX(x, y, a)       \
    2004             :         (       x ^= (a),       \
    2005             :         y ^= x, x = rol64(x,12),\
    2006             :         x += y, y = rol64(y,45),\
    2007             :         y *= 9                  )
    2008             : 
    2009             : /*
    2010             :  * Fold two longs into one 32-bit hash value.  This must be fast, but
    2011             :  * latency isn't quite as critical, as there is a fair bit of additional
    2012             :  * work done before the hash value is used.
    2013             :  */
    2014      234798 : static inline unsigned int fold_hash(unsigned long x, unsigned long y)
    2015             : {
    2016      234798 :         y ^= x * GOLDEN_RATIO_64;
    2017      234798 :         y *= GOLDEN_RATIO_64;
    2018      234798 :         return y >> 32;
    2019             : }
    2020             : 
    2021             : #else   /* 32-bit case */
    2022             : 
    2023             : /*
    2024             :  * Mixing scores (in bits) for (7,20):
    2025             :  * Input delta: 1-bit      2-bit
    2026             :  * 1 round:     330.3     9201.6
    2027             :  * 2 rounds:   1246.4    25475.4
    2028             :  * 3 rounds:   1907.1    31295.1
    2029             :  * 4 rounds:   2042.3    31718.6
    2030             :  * Perfect:    2048      31744
    2031             :  *            (32*64)   (32*31/2 * 64)
    2032             :  */
    2033             : #define HASH_MIX(x, y, a)       \
    2034             :         (       x ^= (a),       \
    2035             :         y ^= x, x = rol32(x, 7),\
    2036             :         x += y, y = rol32(y,20),\
    2037             :         y *= 9                  )
    2038             : 
    2039             : static inline unsigned int fold_hash(unsigned long x, unsigned long y)
    2040             : {
    2041             :         /* Use arch-optimized multiply if one exists */
    2042             :         return __hash_32(y ^ __hash_32(x));
    2043             : }
    2044             : 
    2045             : #endif
    2046             : 
    2047             : /*
    2048             :  * Return the hash of a string of known length.  This is carefully
    2049             :  * designed to match hash_name(), which is the more critical function.
    2050             :  * In particular, we must end by hashing a final word containing 0..7
    2051             :  * payload bytes, to match the way that hash_name() iterates until it
    2052             :  * finds the delimiter after the name.
    2053             :  */
    2054        8448 : unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
    2055             : {
    2056        8448 :         unsigned long a, x = 0, y = (unsigned long)salt;
    2057             : 
    2058       15226 :         for (;;) {
    2059       11837 :                 if (!len)
    2060         327 :                         goto done;
    2061       11510 :                 a = load_unaligned_zeropad(name);
    2062       11510 :                 if (len < sizeof(unsigned long))
    2063             :                         break;
    2064        3389 :                 HASH_MIX(x, y, a);
    2065        3389 :                 name += sizeof(unsigned long);
    2066        3389 :                 len -= sizeof(unsigned long);
    2067             :         }
    2068        8121 :         x ^= a & bytemask_from_count(len);
    2069        8448 : done:
    2070        8448 :         return fold_hash(x, y);
    2071             : }
    2072             : EXPORT_SYMBOL(full_name_hash);
    2073             : 
    2074             : /* Return the "hash_len" (hash and length) of a null-terminated string */
    2075           5 : u64 hashlen_string(const void *salt, const char *name)
    2076             : {
    2077           5 :         unsigned long a = 0, x = 0, y = (unsigned long)salt;
    2078           5 :         unsigned long adata, mask, len;
    2079           5 :         const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
    2080             : 
    2081           5 :         len = 0;
    2082           5 :         goto inside;
    2083             : 
    2084           2 :         do {
    2085           2 :                 HASH_MIX(x, y, a);
    2086           2 :                 len += sizeof(unsigned long);
    2087           7 : inside:
    2088           7 :                 a = load_unaligned_zeropad(name+len);
    2089           7 :         } while (!has_zero(a, &adata, &constants));
    2090             : 
    2091           5 :         adata = prep_zero_mask(a, adata, &constants);
    2092           5 :         mask = create_zero_mask(adata);
    2093           5 :         x ^= a & zero_bytemask(mask);
    2094             : 
    2095           5 :         return hashlen_create(fold_hash(x, y), len + find_zero(mask));
    2096             : }
    2097             : EXPORT_SYMBOL(hashlen_string);
    2098             : 
    2099             : /*
    2100             :  * Calculate the length and hash of the path component, and
    2101             :  * return the "hash_len" as the result.
    2102             :  */
    2103      226345 : static inline u64 hash_name(const void *salt, const char *name)
    2104             : {
    2105      226345 :         unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
    2106      226345 :         unsigned long adata, bdata, mask, len;
    2107      226345 :         const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
    2108             : 
    2109      226345 :         len = 0;
    2110      226345 :         goto inside;
    2111             : 
    2112       80349 :         do {
    2113       80349 :                 HASH_MIX(x, y, a);
    2114       80349 :                 len += sizeof(unsigned long);
    2115      306694 : inside:
    2116      306694 :                 a = load_unaligned_zeropad(name+len);
    2117      306694 :                 b = a ^ REPEAT_BYTE('/');
    2118      306694 :         } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
    2119             : 
    2120      226345 :         adata = prep_zero_mask(a, adata, &constants);
    2121      226345 :         bdata = prep_zero_mask(b, bdata, &constants);
    2122      226345 :         mask = create_zero_mask(adata | bdata);
    2123      226345 :         x ^= a & zero_bytemask(mask);
    2124             : 
    2125      226345 :         return hashlen_create(fold_hash(x, y), len + find_zero(mask));
    2126             : }
    2127             : 
    2128             : #else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
    2129             : 
    2130             : /* Return the hash of a string of known length */
    2131             : unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
    2132             : {
    2133             :         unsigned long hash = init_name_hash(salt);
    2134             :         while (len--)
    2135             :                 hash = partial_name_hash((unsigned char)*name++, hash);
    2136             :         return end_name_hash(hash);
    2137             : }
    2138             : EXPORT_SYMBOL(full_name_hash);
    2139             : 
    2140             : /* Return the "hash_len" (hash and length) of a null-terminated string */
    2141             : u64 hashlen_string(const void *salt, const char *name)
    2142             : {
    2143             :         unsigned long hash = init_name_hash(salt);
    2144             :         unsigned long len = 0, c;
    2145             : 
    2146             :         c = (unsigned char)*name;
    2147             :         while (c) {
    2148             :                 len++;
    2149             :                 hash = partial_name_hash(c, hash);
    2150             :                 c = (unsigned char)name[len];
    2151             :         }
    2152             :         return hashlen_create(end_name_hash(hash), len);
    2153             : }
    2154             : EXPORT_SYMBOL(hashlen_string);
    2155             : 
    2156             : /*
    2157             :  * We know there's a real path component here of at least
    2158             :  * one character.
    2159             :  */
    2160             : static inline u64 hash_name(const void *salt, const char *name)
    2161             : {
    2162             :         unsigned long hash = init_name_hash(salt);
    2163             :         unsigned long len = 0, c;
    2164             : 
    2165             :         c = (unsigned char)*name;
    2166             :         do {
    2167             :                 len++;
    2168             :                 hash = partial_name_hash(c, hash);
    2169             :                 c = (unsigned char)name[len];
    2170             :         } while (c && c != '/');
    2171             :         return hashlen_create(end_name_hash(hash), len);
    2172             : }
    2173             : 
    2174             : #endif
    2175             : 
    2176             : /*
    2177             :  * Name resolution.
    2178             :  * This is the basic name resolution function, turning a pathname into
    2179             :  * the final dentry. We expect 'base' to be positive and a directory.
    2180             :  *
    2181             :  * Returns 0 and nd will have valid dentry and mnt on success.
    2182             :  * Returns error and drops reference to input namei data on failure.
    2183             :  */
    2184      114110 : static int link_path_walk(const char *name, struct nameidata *nd)
    2185             : {
    2186      114110 :         int depth = 0; // depth <= nd->depth
    2187      114110 :         int err;
    2188             : 
    2189      114110 :         nd->last_type = LAST_ROOT;
    2190      114110 :         nd->flags |= LOOKUP_PARENT;
    2191      114110 :         if (IS_ERR(name))
    2192       16185 :                 return PTR_ERR(name);
    2193      150375 :         while (*name=='/')
    2194       52450 :                 name++;
    2195       97925 :         if (!*name) {
    2196        7317 :                 nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
    2197        7317 :                 return 0;
    2198             :         }
    2199             : 
    2200             :         /* At this point we know we have a real path component. */
    2201      226332 :         for(;;) {
    2202      226332 :                 struct user_namespace *mnt_userns;
    2203      226332 :                 const char *link;
    2204      226332 :                 u64 hash_len;
    2205      226332 :                 int type;
    2206             : 
    2207      226332 :                 mnt_userns = mnt_user_ns(nd->path.mnt);
    2208      226338 :                 err = may_lookup(mnt_userns, nd);
    2209      226355 :                 if (err)
    2210           9 :                         return err;
    2211             : 
    2212      226346 :                 hash_len = hash_name(nd->path.dentry, name);
    2213             : 
    2214      226350 :                 type = LAST_NORM;
    2215      226350 :                 if (name[0] == '.') switch (hashlen_len(hash_len)) {
    2216        1167 :                         case 2:
    2217        1167 :                                 if (name[1] == '.') {
    2218        1167 :                                         type = LAST_DOTDOT;
    2219        1167 :                                         nd->flags |= LOOKUP_JUMPED;
    2220             :                                 }
    2221             :                                 break;
    2222         377 :                         case 1:
    2223         377 :                                 type = LAST_DOT;
    2224             :                 }
    2225      226350 :                 if (likely(type == LAST_NORM)) {
    2226      224806 :                         struct dentry *parent = nd->path.dentry;
    2227      224806 :                         nd->flags &= ~LOOKUP_JUMPED;
    2228      224806 :                         if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
    2229           0 :                                 struct qstr this = { { .hash_len = hash_len }, .name = name };
    2230           0 :                                 err = parent->d_op->d_hash(parent, &this);
    2231           0 :                                 if (err < 0)
    2232           0 :                                         return err;
    2233           0 :                                 hash_len = this.hash_len;
    2234           0 :                                 name = this.name;
    2235             :                         }
    2236             :                 }
    2237             : 
    2238      226350 :                 nd->last.hash_len = hash_len;
    2239      226350 :                 nd->last.name = name;
    2240      226350 :                 nd->last_type = type;
    2241             : 
    2242      226350 :                 name += hashlen_len(hash_len);
    2243      226350 :                 if (!*name)
    2244       85329 :                         goto OK;
    2245             :                 /*
    2246             :                  * If it wasn't NUL, we know it was '/'. Skip that
    2247             :                  * slash, and continue until no more slashes.
    2248             :                  */
    2249      141301 :                 do {
    2250      141301 :                         name++;
    2251      141301 :                 } while (unlikely(*name == '/'));
    2252      141021 :                 if (unlikely(!*name)) {
    2253         316 : OK:
    2254             :                         /* pathname or trailing symlink, done */
    2255       85645 :                         if (!depth) {
    2256       84604 :                                 nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
    2257       84604 :                                 nd->dir_mode = nd->inode->i_mode;
    2258       84604 :                                 nd->flags &= ~LOOKUP_PARENT;
    2259       84604 :                                 return 0;
    2260             :                         }
    2261             :                         /* last component of nested symlink */
    2262        1041 :                         name = nd->stack[--depth].name;
    2263        1041 :                         link = walk_component(nd, 0);
    2264             :                 } else {
    2265             :                         /* not the last component */
    2266      140705 :                         link = walk_component(nd, WALK_MORE);
    2267             :                 }
    2268      141742 :                 if (unlikely(link)) {
    2269        7061 :                         if (IS_ERR(link))
    2270        6018 :                                 return PTR_ERR(link);
    2271             :                         /* a symlink to follow */
    2272        1043 :                         nd->stack[depth++].name = name;
    2273        1043 :                         name = link;
    2274        1043 :                         continue;
    2275             :                 }
    2276      134681 :                 if (unlikely(!d_can_lookup(nd->path.dentry))) {
    2277           0 :                         if (nd->flags & LOOKUP_RCU) {
    2278           0 :                                 if (!try_to_unlazy(nd))
    2279             :                                         return -ECHILD;
    2280             :                         }
    2281           0 :                         return -ENOTDIR;
    2282             :                 }
    2283             :         }
    2284             : }
    2285             : 
    2286             : /* must be paired with terminate_walk() */
    2287       90956 : static const char *path_init(struct nameidata *nd, unsigned flags)
    2288             : {
    2289       90956 :         int error;
    2290       90956 :         const char *s = nd->name->name;
    2291             : 
    2292             :         /* LOOKUP_CACHED requires RCU, ask caller to retry */
    2293       90956 :         if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
    2294       90956 :                 return ERR_PTR(-EAGAIN);
    2295             : 
    2296       90956 :         if (!*s)
    2297          34 :                 flags &= ~LOOKUP_RCU;
    2298       90956 :         if (flags & LOOKUP_RCU)
    2299       90838 :                 rcu_read_lock();
    2300             : 
    2301       90954 :         nd->flags = flags | LOOKUP_JUMPED;
    2302       90954 :         nd->depth = 0;
    2303             : 
    2304      413543 :         nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
    2305      132856 :         nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
    2306       90948 :         smp_rmb();
    2307             : 
    2308       90949 :         if (flags & LOOKUP_ROOT) {
    2309           0 :                 struct dentry *root = nd->root.dentry;
    2310           0 :                 struct inode *inode = root->d_inode;
    2311           0 :                 if (*s && unlikely(!d_can_lookup(root)))
    2312       90956 :                         return ERR_PTR(-ENOTDIR);
    2313           0 :                 nd->path = nd->root;
    2314           0 :                 nd->inode = inode;
    2315           0 :                 if (flags & LOOKUP_RCU) {
    2316           0 :                         nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
    2317           0 :                         nd->root_seq = nd->seq;
    2318             :                 } else {
    2319           0 :                         path_get(&nd->path);
    2320             :                 }
    2321           0 :                 return s;
    2322             :         }
    2323             : 
    2324       90949 :         nd->root.mnt = NULL;
    2325       90949 :         nd->path.mnt = NULL;
    2326       90949 :         nd->path.dentry = NULL;
    2327             : 
    2328             :         /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
    2329       90949 :         if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
    2330       52454 :                 error = nd_jump_root(nd);
    2331       52457 :                 if (unlikely(error))
    2332           0 :                         return ERR_PTR(error);
    2333             :                 return s;
    2334             :         }
    2335             : 
    2336             :         /* Relative pathname -- get the starting-point it is relative to. */
    2337       38495 :         if (nd->dfd == AT_FDCWD) {
    2338        7710 :                 if (flags & LOOKUP_RCU) {
    2339        7710 :                         struct fs_struct *fs = current->fs;
    2340        7710 :                         unsigned seq;
    2341             : 
    2342        7710 :                         do {
    2343        7710 :                                 seq = read_seqcount_begin(&fs->seq);
    2344        7710 :                                 nd->path = fs->pwd;
    2345        7710 :                                 nd->inode = nd->path.dentry->d_inode;
    2346        7710 :                                 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
    2347        7710 :                         } while (read_seqcount_retry(&fs->seq, seq));
    2348             :                 } else {
    2349           0 :                         get_fs_pwd(current->fs, &nd->path);
    2350           0 :                         nd->inode = nd->path.dentry->d_inode;
    2351             :                 }
    2352             :         } else {
    2353             :                 /* Caller must check execute permissions on the starting path component */
    2354       30785 :                 struct fd f = fdget_raw(nd->dfd);
    2355       30792 :                 struct dentry *dentry;
    2356             : 
    2357       30792 :                 if (!f.file)
    2358       90956 :                         return ERR_PTR(-EBADF);
    2359             : 
    2360       30792 :                 dentry = f.file->f_path.dentry;
    2361             : 
    2362       30792 :                 if (*s && unlikely(!d_can_lookup(dentry))) {
    2363           0 :                         fdput(f);
    2364           0 :                         return ERR_PTR(-ENOTDIR);
    2365             :                 }
    2366             : 
    2367       30792 :                 nd->path = f.file->f_path;
    2368       30792 :                 if (flags & LOOKUP_RCU) {
    2369       30713 :                         nd->inode = nd->path.dentry->d_inode;
    2370       30713 :                         nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
    2371             :                 } else {
    2372          79 :                         path_get(&nd->path);
    2373          79 :                         nd->inode = nd->path.dentry->d_inode;
    2374             :                 }
    2375       30799 :                 fdput(f);
    2376             :         }
    2377             : 
    2378             :         /* For scoped-lookups we need to set the root to the dirfd as well. */
    2379       38499 :         if (flags & LOOKUP_IS_SCOPED) {
    2380           0 :                 nd->root = nd->path;
    2381           0 :                 if (flags & LOOKUP_RCU) {
    2382           0 :                         nd->root_seq = nd->seq;
    2383             :                 } else {
    2384           0 :                         path_get(&nd->root);
    2385           0 :                         nd->flags |= LOOKUP_ROOT_GRABBED;
    2386             :                 }
    2387             :         }
    2388             :         return s;
    2389             : }
    2390             : 
    2391       59591 : static inline const char *lookup_last(struct nameidata *nd)
    2392             : {
    2393       59591 :         if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
    2394         179 :                 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
    2395             : 
    2396       59591 :         return walk_component(nd, WALK_TRAILING);
    2397             : }
    2398             : 
    2399         114 : static int handle_lookup_down(struct nameidata *nd)
    2400             : {
    2401         114 :         if (!(nd->flags & LOOKUP_RCU))
    2402         114 :                 dget(nd->path.dentry);
    2403         114 :         return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
    2404             :                         nd->path.dentry, nd->inode, nd->seq));
    2405             : }
    2406             : 
    2407             : /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
    2408       59125 : static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
    2409             : {
    2410       59125 :         const char *s = path_init(nd, flags);
    2411       59123 :         int err;
    2412             : 
    2413       59123 :         if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
    2414           0 :                 err = handle_lookup_down(nd);
    2415           0 :                 if (unlikely(err < 0))
    2416           0 :                         s = ERR_PTR(err);
    2417             :         }
    2418             : 
    2419       72706 :         while (!(err = link_path_walk(s, nd)) &&
    2420       59593 :                (s = lookup_last(nd)) != NULL)
    2421       72713 :                 ;
    2422       59122 :         if (!err)
    2423       46010 :                 err = complete_walk(nd);
    2424             : 
    2425       59120 :         if (!err && nd->flags & LOOKUP_DIRECTORY)
    2426        1610 :                 if (!d_can_lookup(nd->path.dentry))
    2427             :                         err = -ENOTDIR;
    2428       59120 :         if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
    2429         114 :                 err = handle_lookup_down(nd);
    2430         114 :                 nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
    2431             :         }
    2432       59120 :         if (!err) {
    2433       45962 :                 *path = nd->path;
    2434       45962 :                 nd->path.mnt = NULL;
    2435       45962 :                 nd->path.dentry = NULL;
    2436             :         }
    2437       59120 :         terminate_walk(nd);
    2438       59119 :         return err;
    2439             : }
    2440             : 
    2441       22782 : int filename_lookup(int dfd, struct filename *name, unsigned flags,
    2442             :                     struct path *path, struct path *root)
    2443             : {
    2444       22782 :         int retval;
    2445       22782 :         struct nameidata nd;
    2446       22782 :         if (IS_ERR(name))
    2447           0 :                 return PTR_ERR(name);
    2448       22782 :         if (unlikely(root)) {
    2449           0 :                 nd.root = *root;
    2450           0 :                 flags |= LOOKUP_ROOT;
    2451             :         }
    2452       22782 :         set_nameidata(&nd, dfd, name);
    2453       22782 :         retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
    2454       22784 :         if (unlikely(retval == -ECHILD))
    2455           9 :                 retval = path_lookupat(&nd, flags, path);
    2456       22784 :         if (unlikely(retval == -ESTALE))
    2457           0 :                 retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
    2458             : 
    2459       22784 :         if (likely(!retval))
    2460       22784 :                 audit_inode(name, path->dentry,
    2461             :                             flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
    2462       22784 :         restore_nameidata();
    2463       22785 :         putname(name);
    2464       22785 :         return retval;
    2465             : }
    2466             : 
    2467             : /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
    2468        7860 : static int path_parentat(struct nameidata *nd, unsigned flags,
    2469             :                                 struct path *parent)
    2470             : {
    2471        7860 :         const char *s = path_init(nd, flags);
    2472        7860 :         int err = link_path_walk(s, nd);
    2473        7860 :         if (!err)
    2474        7129 :                 err = complete_walk(nd);
    2475        7860 :         if (!err) {
    2476        7129 :                 *parent = nd->path;
    2477        7129 :                 nd->path.mnt = NULL;
    2478        7129 :                 nd->path.dentry = NULL;
    2479             :         }
    2480        7860 :         terminate_walk(nd);
    2481        7860 :         return err;
    2482             : }
    2483             : 
    2484        7860 : static struct filename *filename_parentat(int dfd, struct filename *name,
    2485             :                                 unsigned int flags, struct path *parent,
    2486             :                                 struct qstr *last, int *type)
    2487             : {
    2488        7860 :         int retval;
    2489        7860 :         struct nameidata nd;
    2490             : 
    2491        7860 :         if (IS_ERR(name))
    2492             :                 return name;
    2493        7860 :         set_nameidata(&nd, dfd, name);
    2494        7860 :         retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
    2495        7860 :         if (unlikely(retval == -ECHILD))
    2496           0 :                 retval = path_parentat(&nd, flags, parent);
    2497        7860 :         if (unlikely(retval == -ESTALE))
    2498           0 :                 retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
    2499        7860 :         if (likely(!retval)) {
    2500        7129 :                 *last = nd.last;
    2501        7129 :                 *type = nd.last_type;
    2502        7129 :                 audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
    2503             :         } else {
    2504         731 :                 putname(name);
    2505         731 :                 name = ERR_PTR(retval);
    2506             :         }
    2507        7860 :         restore_nameidata();
    2508        7860 :         return name;
    2509             : }
    2510             : 
    2511             : /* does lookup, returns the object with parent locked */
    2512           0 : struct dentry *kern_path_locked(const char *name, struct path *path)
    2513             : {
    2514           0 :         struct filename *filename;
    2515           0 :         struct dentry *d;
    2516           0 :         struct qstr last;
    2517           0 :         int type;
    2518             : 
    2519           0 :         filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
    2520             :                                     &last, &type);
    2521           0 :         if (IS_ERR(filename))
    2522           0 :                 return ERR_CAST(filename);
    2523           0 :         if (unlikely(type != LAST_NORM)) {
    2524           0 :                 path_put(path);
    2525           0 :                 putname(filename);
    2526           0 :                 return ERR_PTR(-EINVAL);
    2527             :         }
    2528           0 :         inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
    2529           0 :         d = __lookup_hash(&last, path->dentry, 0);
    2530           0 :         if (IS_ERR(d)) {
    2531           0 :                 inode_unlock(path->dentry->d_inode);
    2532           0 :                 path_put(path);
    2533             :         }
    2534           0 :         putname(filename);
    2535           0 :         return d;
    2536             : }
    2537             : 
    2538         668 : int kern_path(const char *name, unsigned int flags, struct path *path)
    2539             : {
    2540         668 :         return filename_lookup(AT_FDCWD, getname_kernel(name),
    2541             :                                flags, path, NULL);
    2542             : }
    2543             : EXPORT_SYMBOL(kern_path);
    2544             : 
    2545             : /**
    2546             :  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
    2547             :  * @dentry:  pointer to dentry of the base directory
    2548             :  * @mnt: pointer to vfs mount of the base directory
    2549             :  * @name: pointer to file name
    2550             :  * @flags: lookup flags
    2551             :  * @path: pointer to struct path to fill
    2552             :  */
    2553           0 : int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
    2554             :                     const char *name, unsigned int flags,
    2555             :                     struct path *path)
    2556             : {
    2557           0 :         struct path root = {.mnt = mnt, .dentry = dentry};
    2558             :         /* the first argument of filename_lookup() is ignored with root */
    2559           0 :         return filename_lookup(AT_FDCWD, getname_kernel(name),
    2560             :                                flags , path, &root);
    2561             : }
    2562             : EXPORT_SYMBOL(vfs_path_lookup);
    2563             : 
    2564        5563 : static int lookup_one_len_common(const char *name, struct dentry *base,
    2565             :                                  int len, struct qstr *this)
    2566             : {
    2567        5563 :         this->name = name;
    2568        5563 :         this->len = len;
    2569        5563 :         this->hash = full_name_hash(base, name, len);
    2570        5563 :         if (!len)
    2571             :                 return -EACCES;
    2572             : 
    2573        5563 :         if (unlikely(name[0] == '.')) {
    2574           0 :                 if (len < 2 || (len == 2 && name[1] == '.'))
    2575             :                         return -EACCES;
    2576             :         }
    2577             : 
    2578       57153 :         while (len--) {
    2579       51590 :                 unsigned int c = *(const unsigned char *)name++;
    2580       51590 :                 if (c == '/' || c == '\0')
    2581             :                         return -EACCES;
    2582             :         }
    2583             :         /*
    2584             :          * See if the low-level filesystem might want
    2585             :          * to use its own hash..
    2586             :          */
    2587        5563 :         if (base->d_flags & DCACHE_OP_HASH) {
    2588           0 :                 int err = base->d_op->d_hash(base, this);
    2589           0 :                 if (err < 0)
    2590             :                         return err;
    2591             :         }
    2592             : 
    2593        5563 :         return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
    2594             : }
    2595             : 
    2596             : /**
    2597             :  * try_lookup_one_len - filesystem helper to lookup single pathname component
    2598             :  * @name:       pathname component to lookup
    2599             :  * @base:       base directory to lookup from
    2600             :  * @len:        maximum length @len should be interpreted to
    2601             :  *
    2602             :  * Look up a dentry by name in the dcache, returning NULL if it does not
    2603             :  * currently exist.  The function does not try to create a dentry.
    2604             :  *
    2605             :  * Note that this routine is purely a helper for filesystem usage and should
    2606             :  * not be called by generic code.
    2607             :  *
    2608             :  * The caller must hold base->i_mutex.
    2609             :  */
    2610           0 : struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
    2611             : {
    2612           0 :         struct qstr this;
    2613           0 :         int err;
    2614             : 
    2615           0 :         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
    2616             : 
    2617           0 :         err = lookup_one_len_common(name, base, len, &this);
    2618           0 :         if (err)
    2619           0 :                 return ERR_PTR(err);
    2620             : 
    2621           0 :         return lookup_dcache(&this, base, 0);
    2622             : }
    2623             : EXPORT_SYMBOL(try_lookup_one_len);
    2624             : 
    2625             : /**
    2626             :  * lookup_one_len - filesystem helper to lookup single pathname component
    2627             :  * @name:       pathname component to lookup
    2628             :  * @base:       base directory to lookup from
    2629             :  * @len:        maximum length @len should be interpreted to
    2630             :  *
    2631             :  * Note that this routine is purely a helper for filesystem usage and should
    2632             :  * not be called by generic code.
    2633             :  *
    2634             :  * The caller must hold base->i_mutex.
    2635             :  */
    2636        5531 : struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
    2637             : {
    2638        5531 :         struct dentry *dentry;
    2639        5531 :         struct qstr this;
    2640        5531 :         int err;
    2641             : 
    2642        5531 :         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
    2643             : 
    2644        5531 :         err = lookup_one_len_common(name, base, len, &this);
    2645        5531 :         if (err)
    2646           0 :                 return ERR_PTR(err);
    2647             : 
    2648        5531 :         dentry = lookup_dcache(&this, base, 0);
    2649        5531 :         return dentry ? dentry : __lookup_slow(&this, base, 0);
    2650             : }
    2651             : EXPORT_SYMBOL(lookup_one_len);
    2652             : 
    2653             : /**
    2654             :  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
    2655             :  * @name:       pathname component to lookup
    2656             :  * @base:       base directory to lookup from
    2657             :  * @len:        maximum length @len should be interpreted to
    2658             :  *
    2659             :  * Note that this routine is purely a helper for filesystem usage and should
    2660             :  * not be called by generic code.
    2661             :  *
    2662             :  * Unlike lookup_one_len, it should be called without the parent
    2663             :  * i_mutex held, and will take the i_mutex itself if necessary.
    2664             :  */
    2665          32 : struct dentry *lookup_one_len_unlocked(const char *name,
    2666             :                                        struct dentry *base, int len)
    2667             : {
    2668          32 :         struct qstr this;
    2669          32 :         int err;
    2670          32 :         struct dentry *ret;
    2671             : 
    2672          32 :         err = lookup_one_len_common(name, base, len, &this);
    2673          32 :         if (err)
    2674           0 :                 return ERR_PTR(err);
    2675             : 
    2676          32 :         ret = lookup_dcache(&this, base, 0);
    2677          32 :         if (!ret)
    2678           8 :                 ret = lookup_slow(&this, base, 0);
    2679             :         return ret;
    2680             : }
    2681             : EXPORT_SYMBOL(lookup_one_len_unlocked);
    2682             : 
    2683             : /*
    2684             :  * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
    2685             :  * on negatives.  Returns known positive or ERR_PTR(); that's what
    2686             :  * most of the users want.  Note that pinned negative with unlocked parent
    2687             :  * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
    2688             :  * need to be very careful; pinned positives have ->d_inode stable, so
    2689             :  * this one avoids such problems.
    2690             :  */
    2691           0 : struct dentry *lookup_positive_unlocked(const char *name,
    2692             :                                        struct dentry *base, int len)
    2693             : {
    2694           0 :         struct dentry *ret = lookup_one_len_unlocked(name, base, len);
    2695           0 :         if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
    2696           0 :                 dput(ret);
    2697           0 :                 ret = ERR_PTR(-ENOENT);
    2698             :         }
    2699           0 :         return ret;
    2700             : }
    2701             : EXPORT_SYMBOL(lookup_positive_unlocked);
    2702             : 
    2703             : #ifdef CONFIG_UNIX98_PTYS
    2704           0 : int path_pts(struct path *path)
    2705             : {
    2706             :         /* Find something mounted on "pts" in the same directory as
    2707             :          * the input path.
    2708             :          */
    2709           0 :         struct dentry *parent = dget_parent(path->dentry);
    2710           0 :         struct dentry *child;
    2711           0 :         struct qstr this = QSTR_INIT("pts", 3);
    2712             : 
    2713           0 :         if (unlikely(!path_connected(path->mnt, parent))) {
    2714           0 :                 dput(parent);
    2715           0 :                 return -ENOENT;
    2716             :         }
    2717           0 :         dput(path->dentry);
    2718           0 :         path->dentry = parent;
    2719           0 :         child = d_hash_and_lookup(parent, &this);
    2720           0 :         if (!child)
    2721             :                 return -ENOENT;
    2722             : 
    2723           0 :         path->dentry = child;
    2724           0 :         dput(parent);
    2725           0 :         follow_down(path);
    2726           0 :         return 0;
    2727             : }
    2728             : #endif
    2729             : 
    2730       22116 : int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
    2731             :                  struct path *path, int *empty)
    2732             : {
    2733       22116 :         return filename_lookup(dfd, getname_flags(name, flags, empty),
    2734             :                                flags, path, NULL);
    2735             : }
    2736             : EXPORT_SYMBOL(user_path_at_empty);
    2737             : 
    2738          21 : int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
    2739             :                    struct inode *inode)
    2740             : {
    2741          21 :         kuid_t fsuid = current_fsuid();
    2742             : 
    2743          21 :         if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
    2744             :                 return 0;
    2745           0 :         if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
    2746             :                 return 0;
    2747           0 :         return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
    2748             : }
    2749             : EXPORT_SYMBOL(__check_sticky);
    2750             : 
    2751             : /*
    2752             :  *      Check whether we can remove a link victim from directory dir, check
    2753             :  *  whether the type of victim is right.
    2754             :  *  1. We can't do it if dir is read-only (done in permission())
    2755             :  *  2. We should have write and exec permissions on dir
    2756             :  *  3. We can't remove anything from append-only dir
    2757             :  *  4. We can't do anything with immutable dir (done in permission())
    2758             :  *  5. If the sticky bit on dir is set we should either
    2759             :  *      a. be owner of dir, or
    2760             :  *      b. be owner of victim, or
    2761             :  *      c. have CAP_FOWNER capability
    2762             :  *  6. If the victim is append-only or immutable we can't do anything with
    2763             :  *     links pointing to it.
    2764             :  *  7. If the victim has an unknown uid or gid we can't change the inode.
    2765             :  *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
    2766             :  *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
    2767             :  * 10. We can't remove a root or mountpoint.
    2768             :  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
    2769             :  *     nfs_async_unlink().
    2770             :  */
    2771        2349 : static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
    2772             :                       struct dentry *victim, bool isdir)
    2773             : {
    2774        2349 :         struct inode *inode = d_backing_inode(victim);
    2775        2349 :         int error;
    2776             : 
    2777        2349 :         if (d_is_negative(victim))
    2778             :                 return -ENOENT;
    2779        2349 :         BUG_ON(!inode);
    2780             : 
    2781        2349 :         BUG_ON(victim->d_parent->d_inode != dir);
    2782             : 
    2783             :         /* Inode writeback is not safe when the uid or gid are invalid. */
    2784        2349 :         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
    2785        2349 :             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
    2786             :                 return -EOVERFLOW;
    2787             : 
    2788        2349 :         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
    2789             : 
    2790        2349 :         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
    2791        2349 :         if (error)
    2792             :                 return error;
    2793        2349 :         if (IS_APPEND(dir))
    2794             :                 return -EPERM;
    2795             : 
    2796        2349 :         if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
    2797        2349 :             IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
    2798        2349 :             HAS_UNMAPPED_ID(mnt_userns, inode))
    2799             :                 return -EPERM;
    2800        2349 :         if (isdir) {
    2801        1072 :                 if (!d_is_dir(victim))
    2802             :                         return -ENOTDIR;
    2803        1072 :                 if (IS_ROOT(victim))
    2804             :                         return -EBUSY;
    2805        2412 :         } else if (d_is_dir(victim))
    2806             :                 return -EISDIR;
    2807        2207 :         if (IS_DEADDIR(dir))
    2808             :                 return -ENOENT;
    2809        2207 :         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
    2810           0 :                 return -EBUSY;
    2811             :         return 0;
    2812             : }
    2813             : 
    2814             : /*      Check whether we can create an object with dentry child in directory
    2815             :  *  dir.
    2816             :  *  1. We can't do it if child already exists (open has special treatment for
    2817             :  *     this case, but since we are inlined it's OK)
    2818             :  *  2. We can't do it if dir is read-only (done in permission())
    2819             :  *  3. We can't do it if the fs can't represent the fsuid or fsgid.
    2820             :  *  4. We should have write and exec permissions on dir
    2821             :  *  5. We can't do it if dir is immutable (done in permission())
    2822             :  */
    2823        1952 : static inline int may_create(struct user_namespace *mnt_userns,
    2824             :                              struct inode *dir, struct dentry *child)
    2825             : {
    2826        1952 :         struct user_namespace *s_user_ns;
    2827        1952 :         audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
    2828        1952 :         if (child->d_inode)
    2829             :                 return -EEXIST;
    2830        1952 :         if (IS_DEADDIR(dir))
    2831             :                 return -ENOENT;
    2832        1952 :         s_user_ns = dir->i_sb->s_user_ns;
    2833        3904 :         if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
    2834        1952 :             !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
    2835           0 :                 return -EOVERFLOW;
    2836        1952 :         return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
    2837             : }
    2838             : 
    2839             : /*
    2840             :  * p1 and p2 should be directories on the same fs.
    2841             :  */
    2842         428 : struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
    2843             : {
    2844         428 :         struct dentry *p;
    2845             : 
    2846         428 :         if (p1 == p2) {
    2847         414 :                 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
    2848         414 :                 return NULL;
    2849             :         }
    2850             : 
    2851          14 :         mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
    2852             : 
    2853          14 :         p = d_ancestor(p2, p1);
    2854          14 :         if (p) {
    2855           0 :                 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
    2856           0 :                 inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
    2857           0 :                 return p;
    2858             :         }
    2859             : 
    2860          14 :         p = d_ancestor(p1, p2);
    2861          14 :         if (p) {
    2862           0 :                 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
    2863           0 :                 inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
    2864           0 :                 return p;
    2865             :         }
    2866             : 
    2867          14 :         inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
    2868          14 :         inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
    2869          14 :         return NULL;
    2870             : }
    2871             : EXPORT_SYMBOL(lock_rename);
    2872             : 
    2873         428 : void unlock_rename(struct dentry *p1, struct dentry *p2)
    2874             : {
    2875         428 :         inode_unlock(p1->d_inode);
    2876         428 :         if (p1 != p2) {
    2877          14 :                 inode_unlock(p2->d_inode);
    2878          14 :                 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
    2879             :         }
    2880         428 : }
    2881             : EXPORT_SYMBOL(unlock_rename);
    2882             : 
    2883             : /**
    2884             :  * vfs_create - create new file
    2885             :  * @mnt_userns: user namespace of the mount the inode was found from
    2886             :  * @dir:        inode of the parent directory
    2887             :  * @dentry:     dentry of the file to be created
    2888             :  * @mode:       mode of the new file
    2889             :  * @want_excl:  whether the file must not yet exist
    2890             :  *
    2891             :  * Create a new file.
    2892             :  *
    2893             :  * If the inode has been found through an idmapped mount the user namespace of
    2894             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    2895             :  * care to map the inode according to @mnt_userns before checking permissions.
    2896             :  * On non-idmapped mounts or if permission checking is to be performed on the
    2897             :  * raw inode simply pass init_user_ns.
    2898             :  */
    2899         471 : int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
    2900             :                struct dentry *dentry, umode_t mode, bool want_excl)
    2901             : {
    2902         471 :         int error = may_create(mnt_userns, dir, dentry);
    2903         471 :         if (error)
    2904             :                 return error;
    2905             : 
    2906         471 :         if (!dir->i_op->create)
    2907             :                 return -EACCES; /* shouldn't it be ENOSYS? */
    2908         471 :         mode &= S_IALLUGO;
    2909         471 :         mode |= S_IFREG;
    2910         471 :         error = security_inode_create(dir, dentry, mode);
    2911         471 :         if (error)
    2912             :                 return error;
    2913         471 :         error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
    2914         471 :         if (!error)
    2915         471 :                 fsnotify_create(dir, dentry);
    2916             :         return error;
    2917             : }
    2918             : EXPORT_SYMBOL(vfs_create);
    2919             : 
    2920           0 : int vfs_mkobj(struct dentry *dentry, umode_t mode,
    2921             :                 int (*f)(struct dentry *, umode_t, void *),
    2922             :                 void *arg)
    2923             : {
    2924           0 :         struct inode *dir = dentry->d_parent->d_inode;
    2925           0 :         int error = may_create(&init_user_ns, dir, dentry);
    2926           0 :         if (error)
    2927             :                 return error;
    2928             : 
    2929           0 :         mode &= S_IALLUGO;
    2930           0 :         mode |= S_IFREG;
    2931           0 :         error = security_inode_create(dir, dentry, mode);
    2932           0 :         if (error)
    2933             :                 return error;
    2934           0 :         error = f(dentry, mode, arg);
    2935           0 :         if (!error)
    2936           0 :                 fsnotify_create(dir, dentry);
    2937             :         return error;
    2938             : }
    2939             : EXPORT_SYMBOL(vfs_mkobj);
    2940             : 
    2941         442 : bool may_open_dev(const struct path *path)
    2942             : {
    2943         442 :         return !(path->mnt->mnt_flags & MNT_NODEV) &&
    2944         442 :                 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
    2945             : }
    2946             : 
    2947       15593 : static int may_open(struct user_namespace *mnt_userns, const struct path *path,
    2948             :                     int acc_mode, int flag)
    2949             : {
    2950       15593 :         struct dentry *dentry = path->dentry;
    2951       15593 :         struct inode *inode = dentry->d_inode;
    2952       15593 :         int error;
    2953             : 
    2954       15593 :         if (!inode)
    2955             :                 return -ENOENT;
    2956             : 
    2957       15593 :         switch (inode->i_mode & S_IFMT) {
    2958             :         case S_IFLNK:
    2959             :                 return -ELOOP;
    2960        1388 :         case S_IFDIR:
    2961        1388 :                 if (acc_mode & MAY_WRITE)
    2962             :                         return -EISDIR;
    2963        1388 :                 if (acc_mode & MAY_EXEC)
    2964             :                         return -EACCES;
    2965             :                 break;
    2966         440 :         case S_IFBLK:
    2967             :         case S_IFCHR:
    2968         440 :                 if (!may_open_dev(path))
    2969             :                         return -EACCES;
    2970         454 :                 fallthrough;
    2971             :         case S_IFIFO:
    2972             :         case S_IFSOCK:
    2973         454 :                 if (acc_mode & MAY_EXEC)
    2974             :                         return -EACCES;
    2975         454 :                 flag &= ~O_TRUNC;
    2976         454 :                 break;
    2977       13740 :         case S_IFREG:
    2978       13740 :                 if ((acc_mode & MAY_EXEC) && path_noexec(path))
    2979             :                         return -EACCES;
    2980             :                 break;
    2981             :         }
    2982             : 
    2983       15582 :         error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
    2984       15583 :         if (error)
    2985             :                 return error;
    2986             : 
    2987             :         /*
    2988             :          * An append-only file must be opened in append mode for writing.
    2989             :          */
    2990       15581 :         if (IS_APPEND(inode)) {
    2991           0 :                 if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
    2992             :                         return -EPERM;
    2993           0 :                 if (flag & O_TRUNC)
    2994             :                         return -EPERM;
    2995             :         }
    2996             : 
    2997             :         /* O_NOATIME can only be set by the owner or superuser */
    2998       15581 :         if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
    2999           0 :                 return -EPERM;
    3000             : 
    3001             :         return 0;
    3002             : }
    3003             : 
    3004          10 : static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
    3005             : {
    3006          10 :         const struct path *path = &filp->f_path;
    3007          10 :         struct inode *inode = path->dentry->d_inode;
    3008          10 :         int error = get_write_access(inode);
    3009          10 :         if (error)
    3010             :                 return error;
    3011             :         /*
    3012             :          * Refuse to truncate files with mandatory locks held on them.
    3013             :          */
    3014          10 :         error = locks_verify_locked(filp);
    3015          10 :         if (!error)
    3016          10 :                 error = security_path_truncate(path);
    3017          10 :         if (!error) {
    3018          10 :                 error = do_truncate(mnt_userns, path->dentry, 0,
    3019             :                                     ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
    3020             :                                     filp);
    3021             :         }
    3022          10 :         put_write_access(inode);
    3023          10 :         return error;
    3024             : }
    3025             : 
    3026           0 : static inline int open_to_namei_flags(int flag)
    3027             : {
    3028           0 :         if ((flag & O_ACCMODE) == 3)
    3029           0 :                 flag--;
    3030           0 :         return flag;
    3031             : }
    3032             : 
    3033         635 : static int may_o_create(struct user_namespace *mnt_userns,
    3034             :                         const struct path *dir, struct dentry *dentry,
    3035             :                         umode_t mode)
    3036             : {
    3037         635 :         struct user_namespace *s_user_ns;
    3038         635 :         int error = security_path_mknod(dir, dentry, mode, 0);
    3039         635 :         if (error)
    3040             :                 return error;
    3041             : 
    3042         635 :         s_user_ns = dir->dentry->d_sb->s_user_ns;
    3043        1270 :         if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
    3044         635 :             !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
    3045           0 :                 return -EOVERFLOW;
    3046             : 
    3047         635 :         error = inode_permission(mnt_userns, dir->dentry->d_inode,
    3048             :                                  MAY_WRITE | MAY_EXEC);
    3049         635 :         if (error)
    3050             :                 return error;
    3051             : 
    3052         631 :         return security_inode_create(dir->dentry->d_inode, dentry, mode);
    3053             : }
    3054             : 
    3055             : /*
    3056             :  * Attempt to atomically look up, create and open a file from a negative
    3057             :  * dentry.
    3058             :  *
    3059             :  * Returns the dentry if successful.  With FMODE_OPENED set, the file has been
    3060             :  * created and attached to @file by the filesystem calling finish_open().
    3061             :  *
    3062             :  * If the file was looked up only or didn't need creating, FMODE_OPENED won't
    3063             :  * be set.  The caller will need to perform the open themselves on the
    3064             :  * returned dentry, which may differ from the one that was passed in.
    3065             :  *
    3066             :  * Returns an ERR_PTR() error otherwise (-ENOENT for an unopened negative dentry).
    3067             :  */
    3068           0 : static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
    3069             :                                   struct file *file,
    3070             :                                   int open_flag, umode_t mode)
    3071             : {
    3072           0 :         struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
    3073           0 :         struct inode *dir =  nd->path.dentry->d_inode;
    3074           0 :         int error;
    3075             : 
    3076           0 :         if (nd->flags & LOOKUP_DIRECTORY)
    3077           0 :                 open_flag |= O_DIRECTORY;
    3078             : 
    3079           0 :         file->f_path.dentry = DENTRY_NOT_SET;
    3080           0 :         file->f_path.mnt = nd->path.mnt;
    3081           0 :         error = dir->i_op->atomic_open(dir, dentry, file,
    3082           0 :                                        open_to_namei_flags(open_flag), mode);
    3083           0 :         d_lookup_done(dentry);
    3084           0 :         if (!error) {
    3085           0 :                 if (file->f_mode & FMODE_OPENED) {
    3086           0 :                         if (unlikely(dentry != file->f_path.dentry)) {
    3087           0 :                                 dput(dentry);
    3088           0 :                                 dentry = dget(file->f_path.dentry);
    3089             :                         }
    3090           0 :                 } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
    3091             :                         error = -EIO;
    3092             :                 } else {
    3093           0 :                         if (file->f_path.dentry) {
    3094           0 :                                 dput(dentry);
    3095           0 :                                 dentry = file->f_path.dentry;
    3096             :                         }
    3097           0 :                         if (unlikely(d_is_negative(dentry)))
    3098             :                                 error = -ENOENT;
    3099             :                 }
    3100             :         }
    3101           0 :         if (error) {
    3102           0 :                 dput(dentry);
    3103           0 :                 dentry = ERR_PTR(error);
    3104             :         }
    3105           0 :         return dentry;
    3106             : }
    3107             : 
    3108             : /*
    3109             :  * Look up and maybe create and open the last component.
    3110             :  *
    3111             :  * Must be called with parent locked (exclusive in O_CREAT case).
    3112             :  *
    3113             :  * Returns the dentry on success, that is, if
    3114             :  *  the file was successfully atomically created (if necessary) and opened, or
    3115             :  *  the file was not completely opened at this time, though lookups and
    3116             :  *  creations were performed.
    3117             :  * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
    3118             :  * In the latter case the returned dentry might be negative if O_CREAT
    3119             :  * hadn't been specified.
    3120             :  *
    3121             :  * An ERR_PTR()-encoded error is returned on failure.
    3122             :  */
    3123        3465 : static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
    3124             :                                   const struct open_flags *op,
    3125             :                                   bool got_write)
    3126             : {
    3127        3465 :         struct user_namespace *mnt_userns;
    3128        3465 :         struct dentry *dir = nd->path.dentry;
    3129        3465 :         struct inode *dir_inode = dir->d_inode;
    3130        3465 :         int open_flag = op->open_flag;
    3131        3465 :         struct dentry *dentry;
    3132        3465 :         int error, create_error = 0;
    3133        3465 :         umode_t mode = op->mode;
    3134        3465 :         DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
    3135             : 
    3136        3465 :         if (unlikely(IS_DEADDIR(dir_inode)))
    3137        3465 :                 return ERR_PTR(-ENOENT);
    3138             : 
    3139        3465 :         file->f_mode &= ~FMODE_CREATED;
    3140        3465 :         dentry = d_lookup(dir, &nd->last);
    3141        3465 :         for (;;) {
    3142        3465 :                 if (!dentry) {
    3143        3351 :                         dentry = d_alloc_parallel(dir, &nd->last, &wq);
    3144        3351 :                         if (IS_ERR(dentry))
    3145             :                                 return dentry;
    3146             :                 }
    3147        3465 :                 if (d_in_lookup(dentry))
    3148             :                         break;
    3149             : 
    3150         116 :                 error = d_revalidate(dentry, nd->flags);
    3151         116 :                 if (likely(error > 0))
    3152             :                         break;
    3153           0 :                 if (error)
    3154           0 :                         goto out_dput;
    3155           0 :                 d_invalidate(dentry);
    3156           0 :                 dput(dentry);
    3157           0 :                 dentry = NULL;
    3158             :         }
    3159        3465 :         if (dentry->d_inode) {
    3160             :                 /* Cached positive dentry: will open in f_op->open */
    3161             :                 return dentry;
    3162             :         }
    3163             : 
    3164             :         /*
    3165             :          * Checking write permission is tricky, because we don't know if we are
    3166             :          * going to actually need it: O_CREAT opens should work as long as the
    3167             :          * file exists.  But checking existence breaks atomicity.  The trick is
    3168             :          * to check access and if not granted clear O_CREAT from the flags.
    3169             :          *
    3170             :          * Another problem is returning the "right" error value (e.g. for an
    3171             :          * O_EXCL open we want to return EEXIST not EROFS).
    3172             :          */
    3173        3356 :         if (unlikely(!got_write))
    3174        2562 :                 open_flag &= ~O_TRUNC;
    3175        3356 :         mnt_userns = mnt_user_ns(nd->path.mnt);
    3176        3356 :         if (open_flag & O_CREAT) {
    3177         636 :                 if (open_flag & O_EXCL)
    3178         313 :                         open_flag &= ~O_TRUNC;
    3179         636 :                 if (!IS_POSIXACL(dir->d_inode))
    3180         636 :                         mode &= ~current_umask();
    3181         636 :                 if (likely(got_write))
    3182         635 :                         create_error = may_o_create(mnt_userns, &nd->path,
    3183             :                                                     dentry, mode);
    3184             :                 else
    3185             :                         create_error = -EROFS;
    3186             :         }
    3187        3356 :         if (create_error)
    3188           5 :                 open_flag &= ~O_CREAT;
    3189        3356 :         if (dir_inode->i_op->atomic_open) {
    3190           0 :                 dentry = atomic_open(nd, dentry, file, open_flag, mode);
    3191           0 :                 if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
    3192           0 :                         dentry = ERR_PTR(create_error);
    3193           0 :                 return dentry;
    3194             :         }
    3195             : 
    3196        3356 :         if (d_in_lookup(dentry)) {
    3197        3349 :                 struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
    3198             :                                                              nd->flags);
    3199        3349 :                 d_lookup_done(dentry);
    3200        3349 :                 if (unlikely(res)) {
    3201         163 :                         if (IS_ERR(res)) {
    3202         163 :                                 error = PTR_ERR(res);
    3203         163 :                                 goto out_dput;
    3204             :                         }
    3205           0 :                         dput(dentry);
    3206           0 :                         dentry = res;
    3207             :                 }
    3208             :         }
    3209             : 
    3210             :         /* Negative dentry, just create the file */
    3211        3193 :         if (!dentry->d_inode && (open_flag & O_CREAT)) {
    3212         620 :                 file->f_mode |= FMODE_CREATED;
    3213         620 :                 audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
    3214         620 :                 if (!dir_inode->i_op->create) {
    3215           0 :                         error = -EACCES;
    3216           0 :                         goto out_dput;
    3217             :                 }
    3218             : 
    3219        1240 :                 error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
    3220         620 :                                                 mode, open_flag & O_EXCL);
    3221         620 :                 if (error)
    3222           0 :                         goto out_dput;
    3223             :         }
    3224        3193 :         if (unlikely(create_error) && !dentry->d_inode) {
    3225           1 :                 error = create_error;
    3226           1 :                 goto out_dput;
    3227             :         }
    3228             :         return dentry;
    3229             : 
    3230         164 : out_dput:
    3231         164 :         dput(dentry);
    3232         164 :         return ERR_PTR(error);
    3233             : }
    3234             : 
    3235       25191 : static const char *open_last_lookups(struct nameidata *nd,
    3236             :                    struct file *file, const struct open_flags *op)
    3237             : {
    3238       25191 :         struct dentry *dir = nd->path.dentry;
    3239       25191 :         int open_flag = op->open_flag;
    3240       25191 :         bool got_write = false;
    3241       25191 :         unsigned seq;
    3242       25191 :         struct inode *inode;
    3243       25191 :         struct dentry *dentry;
    3244       25191 :         const char *res;
    3245             : 
    3246       25191 :         nd->flags |= op->intent;
    3247             : 
    3248       25191 :         if (nd->last_type != LAST_NORM) {
    3249          38 :                 if (nd->depth)
    3250           0 :                         put_link(nd);
    3251          38 :                 return handle_dots(nd, nd->last_type);
    3252             :         }
    3253             : 
    3254       25153 :         if (!(open_flag & O_CREAT)) {
    3255       24408 :                 if (nd->last.name[nd->last.len])
    3256         130 :                         nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
    3257             :                 /* we _can_ be in RCU mode here */
    3258       24408 :                 dentry = lookup_fast(nd, &inode, &seq);
    3259       24411 :                 if (IS_ERR(dentry))
    3260       25192 :                         return ERR_CAST(dentry);
    3261       24409 :                 if (likely(dentry))
    3262       21689 :                         goto finish_lookup;
    3263             : 
    3264        2720 :                 BUG_ON(nd->flags & LOOKUP_RCU);
    3265             :         } else {
    3266             :                 /* create side of things */
    3267         745 :                 if (nd->flags & LOOKUP_RCU) {
    3268         738 :                         if (!try_to_unlazy(nd))
    3269       25192 :                                 return ERR_PTR(-ECHILD);
    3270             :                 }
    3271         745 :                 audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
    3272             :                 /* trailing slashes? */
    3273         745 :                 if (unlikely(nd->last.name[nd->last.len]))
    3274       25192 :                         return ERR_PTR(-EISDIR);
    3275             :         }
    3276             : 
    3277        3465 :         if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
    3278         904 :                 got_write = !mnt_want_write(nd->path.mnt);
    3279             :                 /*
    3280             :                  * do _not_ fail yet - we might not need that or fail with
    3281             :                  * a different error; let lookup_open() decide; we'll be
    3282             :                  * dropping this one anyway.
    3283             :                  */
    3284             :         }
    3285        3465 :         if (open_flag & O_CREAT)
    3286         745 :                 inode_lock(dir->d_inode);
    3287             :         else
    3288        2720 :                 inode_lock_shared(dir->d_inode);
    3289        3465 :         dentry = lookup_open(nd, file, op, got_write);
    3290        3465 :         if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
    3291         620 :                 fsnotify_create(dir->d_inode, dentry);
    3292        3465 :         if (open_flag & O_CREAT)
    3293         745 :                 inode_unlock(dir->d_inode);
    3294             :         else
    3295        2720 :                 inode_unlock_shared(dir->d_inode);
    3296             : 
    3297        3465 :         if (got_write)
    3298         902 :                 mnt_drop_write(nd->path.mnt);
    3299             : 
    3300        3465 :         if (IS_ERR(dentry))
    3301       25192 :                 return ERR_CAST(dentry);
    3302             : 
    3303        3301 :         if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
    3304         620 :                 dput(nd->path.dentry);
    3305         620 :                 nd->path.dentry = dentry;
    3306         620 :                 return NULL;
    3307             :         }
    3308             : 
    3309        2681 : finish_lookup:
    3310       24370 :         if (nd->depth)
    3311        5671 :                 put_link(nd);
    3312       24370 :         res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
    3313       24368 :         if (unlikely(res))
    3314        9424 :                 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
    3315             :         return res;
    3316             : }
    3317             : 
    3318             : /*
    3319             :  * Handle the last step of open()
    3320             :  */
    3321       15603 : static int do_open(struct nameidata *nd,
    3322             :                    struct file *file, const struct open_flags *op)
    3323             : {
    3324       15603 :         struct user_namespace *mnt_userns;
    3325       15603 :         int open_flag = op->open_flag;
    3326       15603 :         bool do_truncate;
    3327       15603 :         int acc_mode;
    3328       15603 :         int error;
    3329             : 
    3330       15603 :         if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
    3331       14983 :                 error = complete_walk(nd);
    3332       14982 :                 if (error)
    3333             :                         return error;
    3334             :         }
    3335       15595 :         if (!(file->f_mode & FMODE_CREATED))
    3336       15595 :                 audit_inode(nd->name, nd->path.dentry, 0);
    3337       15595 :         mnt_userns = mnt_user_ns(nd->path.mnt);
    3338       15595 :         if (open_flag & O_CREAT) {
    3339         741 :                 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
    3340             :                         return -EEXIST;
    3341        1476 :                 if (d_is_dir(nd->path.dentry))
    3342             :                         return -EISDIR;
    3343        1476 :                 error = may_create_in_sticky(mnt_userns, nd,
    3344         738 :                                              d_backing_inode(nd->path.dentry));
    3345         738 :                 if (unlikely(error))
    3346             :                         return error;
    3347             :         }
    3348       15592 :         if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
    3349             :                 return -ENOTDIR;
    3350             : 
    3351       15592 :         do_truncate = false;
    3352       15592 :         acc_mode = op->acc_mode;
    3353       15592 :         if (file->f_mode & FMODE_CREATED) {
    3354             :                 /* Don't check for write permission, don't truncate */
    3355         620 :                 open_flag &= ~O_TRUNC;
    3356         620 :                 acc_mode = 0;
    3357       14972 :         } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
    3358          10 :                 error = mnt_want_write(nd->path.mnt);
    3359          10 :                 if (error)
    3360             :                         return error;
    3361             :                 do_truncate = true;
    3362             :         }
    3363       15592 :         error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
    3364       15594 :         if (!error && !(file->f_mode & FMODE_OPENED))
    3365       15581 :                 error = vfs_open(&nd->path, file);
    3366       15592 :         if (!error)
    3367       15361 :                 error = ima_file_check(file, op->acc_mode);
    3368       15592 :         if (!error && do_truncate)
    3369          10 :                 error = handle_truncate(mnt_userns, file);
    3370       15592 :         if (unlikely(error > 0)) {
    3371           0 :                 WARN_ON(1);
    3372           0 :                 error = -EINVAL;
    3373             :         }
    3374       15592 :         if (do_truncate)
    3375          10 :                 mnt_drop_write(nd->path.mnt);
    3376             :         return error;
    3377             : }
    3378             : 
    3379             : /**
    3380             :  * vfs_tmpfile - create tmpfile
    3381             :  * @mnt_userns: user namespace of the mount the inode was found from
    3382             :  * @dentry:     pointer to dentry of the base directory
    3383             :  * @mode:       mode of the new tmpfile
    3384             :  * @open_flag:  flags
    3385             :  *
    3386             :  * Create a temporary file.
    3387             :  *
    3388             :  * If the inode has been found through an idmapped mount the user namespace of
    3389             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    3390             :  * care to map the inode according to @mnt_userns before checking permissions.
    3391             :  * On non-idmapped mounts or if permission checking is to be performed on the
    3392             :  * raw inode simply pass init_user_ns.
    3393             :  */
    3394           5 : struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
    3395             :                            struct dentry *dentry, umode_t mode, int open_flag)
    3396             : {
    3397           5 :         struct dentry *child = NULL;
    3398           5 :         struct inode *dir = dentry->d_inode;
    3399           5 :         struct inode *inode;
    3400           5 :         int error;
    3401             : 
    3402             :         /* we want directory to be writable */
    3403           5 :         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
    3404           5 :         if (error)
    3405           0 :                 goto out_err;
    3406           5 :         error = -EOPNOTSUPP;
    3407           5 :         if (!dir->i_op->tmpfile)
    3408           0 :                 goto out_err;
    3409           5 :         error = -ENOMEM;
    3410           5 :         child = d_alloc(dentry, &slash_name);
    3411           5 :         if (unlikely(!child))
    3412           0 :                 goto out_err;
    3413           5 :         error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
    3414           5 :         if (error)
    3415           0 :                 goto out_err;
    3416           5 :         error = -ENOENT;
    3417           5 :         inode = child->d_inode;
    3418           5 :         if (unlikely(!inode))
    3419           0 :                 goto out_err;
    3420           5 :         if (!(open_flag & O_EXCL)) {
    3421           5 :                 spin_lock(&inode->i_lock);
    3422           5 :                 inode->i_state |= I_LINKABLE;
    3423           5 :                 spin_unlock(&inode->i_lock);
    3424             :         }
    3425           5 :         ima_post_create_tmpfile(mnt_userns, inode);
    3426             :         return child;
    3427             : 
    3428           0 : out_err:
    3429           0 :         dput(child);
    3430           0 :         return ERR_PTR(error);
    3431             : }
    3432             : EXPORT_SYMBOL(vfs_tmpfile);
    3433             : 
    3434           0 : static int do_tmpfile(struct nameidata *nd, unsigned flags,
    3435             :                 const struct open_flags *op,
    3436             :                 struct file *file)
    3437             : {
    3438           0 :         struct user_namespace *mnt_userns;
    3439           0 :         struct dentry *child;
    3440           0 :         struct path path;
    3441           0 :         int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
    3442           0 :         if (unlikely(error))
    3443             :                 return error;
    3444           0 :         error = mnt_want_write(path.mnt);
    3445           0 :         if (unlikely(error))
    3446           0 :                 goto out;
    3447           0 :         mnt_userns = mnt_user_ns(path.mnt);
    3448           0 :         child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
    3449           0 :         error = PTR_ERR(child);
    3450           0 :         if (IS_ERR(child))
    3451           0 :                 goto out2;
    3452           0 :         dput(path.dentry);
    3453           0 :         path.dentry = child;
    3454           0 :         audit_inode(nd->name, child, 0);
    3455             :         /* Don't check for other permissions, the inode was just created */
    3456           0 :         error = may_open(mnt_userns, &path, 0, op->open_flag);
    3457           0 :         if (!error)
    3458           0 :                 error = vfs_open(&path, file);
    3459           0 : out2:
    3460           0 :         mnt_drop_write(path.mnt);
    3461           0 : out:
    3462           0 :         path_put(&path);
    3463           0 :         return error;
    3464             : }
    3465             : 
    3466       36333 : static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
    3467             : {
    3468       36333 :         struct path path;
    3469       36333 :         int error = path_lookupat(nd, flags, &path);
    3470       36324 :         if (!error) {
    3471       30248 :                 audit_inode(nd->name, path.dentry, 0);
    3472       30248 :                 error = vfs_open(&path, file);
    3473       30265 :                 path_put(&path);
    3474             :         }
    3475       36338 :         return error;
    3476             : }
    3477             : 
    3478       60269 : static struct file *path_openat(struct nameidata *nd,
    3479             :                         const struct open_flags *op, unsigned flags)
    3480             : {
    3481       60269 :         struct file *file;
    3482       60269 :         int error;
    3483             : 
    3484       60269 :         file = alloc_empty_file(op->open_flag, current_cred());
    3485       60308 :         if (IS_ERR(file))
    3486             :                 return file;
    3487             : 
    3488       60308 :         if (unlikely(file->f_flags & __O_TMPFILE)) {
    3489           0 :                 error = do_tmpfile(nd, flags, op, file);
    3490       60308 :         } else if (unlikely(file->f_flags & O_PATH)) {
    3491       36333 :                 error = do_o_path(nd, flags, file);
    3492             :         } else {
    3493       23975 :                 const char *s = path_init(nd, flags);
    3494       57536 :                 while (!(error = link_path_walk(s, nd)) &&
    3495       25191 :                        (s = open_last_lookups(nd, file, op)) != NULL)
    3496       33562 :                         ;
    3497       23973 :                 if (!error)
    3498       15603 :                         error = do_open(nd, file, op);
    3499       23973 :                 terminate_walk(nd);
    3500             :         }
    3501       60297 :         if (likely(!error)) {
    3502       45619 :                 if (likely(file->f_mode & FMODE_OPENED))
    3503             :                         return file;
    3504           0 :                 WARN_ON(1);
    3505           0 :                 error = -EINVAL;
    3506             :         }
    3507       14678 :         fput(file);
    3508       14679 :         if (error == -EOPENSTALE) {
    3509           0 :                 if (flags & LOOKUP_RCU)
    3510             :                         error = -ECHILD;
    3511             :                 else
    3512           0 :                         error = -ESTALE;
    3513             :         }
    3514       14679 :         return ERR_PTR(error);
    3515             : }
    3516             : 
    3517       60211 : struct file *do_filp_open(int dfd, struct filename *pathname,
    3518             :                 const struct open_flags *op)
    3519             : {
    3520       60211 :         struct nameidata nd;
    3521       60211 :         int flags = op->lookup_flags;
    3522       60211 :         struct file *filp;
    3523             : 
    3524       60211 :         set_nameidata(&nd, dfd, pathname);
    3525       60211 :         filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    3526       60219 :         if (unlikely(filp == ERR_PTR(-ECHILD)))
    3527          76 :                 filp = path_openat(&nd, op, flags);
    3528       60219 :         if (unlikely(filp == ERR_PTR(-ESTALE)))
    3529           0 :                 filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
    3530       60219 :         restore_nameidata();
    3531       60221 :         return filp;
    3532             : }
    3533             : 
    3534           0 : struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
    3535             :                 const char *name, const struct open_flags *op)
    3536             : {
    3537           0 :         struct nameidata nd;
    3538           0 :         struct file *file;
    3539           0 :         struct filename *filename;
    3540           0 :         int flags = op->lookup_flags | LOOKUP_ROOT;
    3541             : 
    3542           0 :         nd.root.mnt = mnt;
    3543           0 :         nd.root.dentry = dentry;
    3544             : 
    3545           0 :         if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
    3546           0 :                 return ERR_PTR(-ELOOP);
    3547             : 
    3548           0 :         filename = getname_kernel(name);
    3549           0 :         if (IS_ERR(filename))
    3550           0 :                 return ERR_CAST(filename);
    3551             : 
    3552           0 :         set_nameidata(&nd, -1, filename);
    3553           0 :         file = path_openat(&nd, op, flags | LOOKUP_RCU);
    3554           0 :         if (unlikely(file == ERR_PTR(-ECHILD)))
    3555           0 :                 file = path_openat(&nd, op, flags);
    3556           0 :         if (unlikely(file == ERR_PTR(-ESTALE)))
    3557           0 :                 file = path_openat(&nd, op, flags | LOOKUP_REVAL);
    3558           0 :         restore_nameidata();
    3559           0 :         putname(filename);
    3560           0 :         return file;
    3561             : }
    3562             : 
    3563        4979 : static struct dentry *filename_create(int dfd, struct filename *name,
    3564             :                                 struct path *path, unsigned int lookup_flags)
    3565             : {
    3566        4979 :         struct dentry *dentry = ERR_PTR(-EEXIST);
    3567        4979 :         struct qstr last;
    3568        4979 :         int type;
    3569        4979 :         int err2;
    3570        4979 :         int error;
    3571        4979 :         bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
    3572             : 
    3573             :         /*
    3574             :          * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
    3575             :          * other flags passed in are ignored!
    3576             :          */
    3577        4979 :         lookup_flags &= LOOKUP_REVAL;
    3578             : 
    3579        4979 :         name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
    3580        4979 :         if (IS_ERR(name))
    3581        4979 :                 return ERR_CAST(name);
    3582             : 
    3583             :         /*
    3584             :          * Yucky last component or no last component at all?
    3585             :          * (foo/., foo/.., /////)
    3586             :          */
    3587        4325 :         if (unlikely(type != LAST_NORM))
    3588           1 :                 goto out;
    3589             : 
    3590             :         /* don't fail immediately if it's r/o, at least try to report other errors */
    3591        4324 :         err2 = mnt_want_write(path->mnt);
    3592             :         /*
    3593             :          * Do the final lookup.
    3594             :          */
    3595        4324 :         lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
    3596        4324 :         inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
    3597        4324 :         dentry = __lookup_hash(&last, path->dentry, lookup_flags);
    3598        4324 :         if (IS_ERR(dentry))
    3599           0 :                 goto unlock;
    3600             : 
    3601        4324 :         error = -EEXIST;
    3602        4324 :         if (d_is_positive(dentry))
    3603        2657 :                 goto fail;
    3604             : 
    3605             :         /*
    3606             :          * Special case - lookup gave negative, but... we had foo/bar/
    3607             :          * From the vfs_mknod() POV we just have a negative dentry -
    3608             :          * all is fine. Let's be bastards - you had / on the end, you've
    3609             :          * been asking for (non-existent) directory. -ENOENT for you.
    3610             :          */
    3611        1667 :         if (unlikely(!is_dir && last.name[last.len])) {
    3612           0 :                 error = -ENOENT;
    3613           0 :                 goto fail;
    3614             :         }
    3615        1667 :         if (unlikely(err2)) {
    3616           0 :                 error = err2;
    3617           0 :                 goto fail;
    3618             :         }
    3619        1667 :         putname(name);
    3620        1667 :         return dentry;
    3621        2657 : fail:
    3622        2657 :         dput(dentry);
    3623        2657 :         dentry = ERR_PTR(error);
    3624        2657 : unlock:
    3625        2657 :         inode_unlock(path->dentry->d_inode);
    3626        2657 :         if (!err2)
    3627        2644 :                 mnt_drop_write(path->mnt);
    3628          13 : out:
    3629        2658 :         path_put(path);
    3630        2658 :         putname(name);
    3631        2658 :         return dentry;
    3632             : }
    3633             : 
    3634         136 : struct dentry *kern_path_create(int dfd, const char *pathname,
    3635             :                                 struct path *path, unsigned int lookup_flags)
    3636             : {
    3637         136 :         return filename_create(dfd, getname_kernel(pathname),
    3638             :                                 path, lookup_flags);
    3639             : }
    3640             : EXPORT_SYMBOL(kern_path_create);
    3641             : 
    3642        1667 : void done_path_create(struct path *path, struct dentry *dentry)
    3643             : {
    3644        1667 :         dput(dentry);
    3645        1667 :         inode_unlock(path->dentry->d_inode);
    3646        1667 :         mnt_drop_write(path->mnt);
    3647        1667 :         path_put(path);
    3648        1667 : }
    3649             : EXPORT_SYMBOL(done_path_create);
    3650             : 
    3651        4843 : inline struct dentry *user_path_create(int dfd, const char __user *pathname,
    3652             :                                 struct path *path, unsigned int lookup_flags)
    3653             : {
    3654        4843 :         return filename_create(dfd, getname(pathname), path, lookup_flags);
    3655             : }
    3656             : EXPORT_SYMBOL(user_path_create);
    3657             : 
    3658             : /**
    3659             :  * vfs_mknod - create device node or file
    3660             :  * @mnt_userns: user namespace of the mount the inode was found from
    3661             :  * @dir:        inode of the parent directory
    3662             :  * @dentry:     dentry of the device node or file to create
    3663             :  * @mode:       mode of the new device node or file
    3664             :  * @dev:        device number of device to create
    3665             :  *
    3666             :  * Create a device node or file.
    3667             :  *
    3668             :  * If the inode has been found through an idmapped mount the user namespace of
    3669             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    3670             :  * care to map the inode according to @mnt_userns before checking permissions.
    3671             :  * On non-idmapped mounts or if permission checking is to be performed on the
    3672             :  * raw inode simply pass init_user_ns.
    3673             :  */
    3674         167 : int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
    3675             :               struct dentry *dentry, umode_t mode, dev_t dev)
    3676             : {
    3677         167 :         bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
    3678         167 :         int error = may_create(mnt_userns, dir, dentry);
    3679             : 
    3680         167 :         if (error)
    3681             :                 return error;
    3682             : 
    3683         302 :         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
    3684         135 :             !capable(CAP_MKNOD))
    3685             :                 return -EPERM;
    3686             : 
    3687         167 :         if (!dir->i_op->mknod)
    3688             :                 return -EPERM;
    3689             : 
    3690         167 :         error = devcgroup_inode_mknod(mode, dev);
    3691         167 :         if (error)
    3692             :                 return error;
    3693             : 
    3694         167 :         error = security_inode_mknod(dir, dentry, mode, dev);
    3695         167 :         if (error)
    3696             :                 return error;
    3697             : 
    3698         167 :         error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
    3699         167 :         if (!error)
    3700         167 :                 fsnotify_create(dir, dentry);
    3701             :         return error;
    3702             : }
    3703             : EXPORT_SYMBOL(vfs_mknod);
    3704             : 
    3705         512 : static int may_mknod(umode_t mode)
    3706             : {
    3707         512 :         switch (mode & S_IFMT) {
    3708             :         case S_IFREG:
    3709             :         case S_IFCHR:
    3710             :         case S_IFBLK:
    3711             :         case S_IFIFO:
    3712             :         case S_IFSOCK:
    3713             :         case 0: /* zero mode translates to S_IFREG */
    3714             :                 return 0;
    3715           0 :         case S_IFDIR:
    3716           0 :                 return -EPERM;
    3717           0 :         default:
    3718           0 :                 return -EINVAL;
    3719             :         }
    3720             : }
    3721             : 
    3722         512 : static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
    3723             :                 unsigned int dev)
    3724             : {
    3725         512 :         struct user_namespace *mnt_userns;
    3726         512 :         struct dentry *dentry;
    3727         512 :         struct path path;
    3728         512 :         int error;
    3729         512 :         unsigned int lookup_flags = 0;
    3730             : 
    3731         512 :         error = may_mknod(mode);
    3732         512 :         if (error)
    3733           0 :                 return error;
    3734         512 : retry:
    3735         512 :         dentry = user_path_create(dfd, filename, &path, lookup_flags);
    3736         512 :         if (IS_ERR(dentry))
    3737           0 :                 return PTR_ERR(dentry);
    3738             : 
    3739         512 :         if (!IS_POSIXACL(path.dentry->d_inode))
    3740         512 :                 mode &= ~current_umask();
    3741         512 :         error = security_path_mknod(&path, dentry, mode, dev);
    3742         512 :         if (error)
    3743           8 :                 goto out;
    3744             : 
    3745         504 :         mnt_userns = mnt_user_ns(path.mnt);
    3746         504 :         switch (mode & S_IFMT) {
    3747         469 :                 case 0: case S_IFREG:
    3748         469 :                         error = vfs_create(mnt_userns, path.dentry->d_inode,
    3749             :                                            dentry, mode, true);
    3750         469 :                         if (!error)
    3751         512 :                                 ima_post_path_mknod(mnt_userns, dentry);
    3752             :                         break;
    3753             :                 case S_IFCHR: case S_IFBLK:
    3754          22 :                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
    3755             :                                           dentry, mode, new_decode_dev(dev));
    3756          22 :                         break;
    3757          13 :                 case S_IFIFO: case S_IFSOCK:
    3758          13 :                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
    3759             :                                           dentry, mode, 0);
    3760          13 :                         break;
    3761             :         }
    3762         512 : out:
    3763         512 :         done_path_create(&path, dentry);
    3764        1024 :         if (retry_estale(error, lookup_flags)) {
    3765           0 :                 lookup_flags |= LOOKUP_REVAL;
    3766           0 :                 goto retry;
    3767             :         }
    3768             :         return error;
    3769             : }
    3770             : 
    3771           0 : SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
    3772             :                 unsigned int, dev)
    3773             : {
    3774           0 :         return do_mknodat(dfd, filename, mode, dev);
    3775             : }
    3776             : 
    3777        1024 : SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
    3778             : {
    3779         512 :         return do_mknodat(AT_FDCWD, filename, mode, dev);
    3780             : }
    3781             : 
    3782             : /**
    3783             :  * vfs_mkdir - create directory
    3784             :  * @mnt_userns: user namespace of the mount the inode was found from
    3785             :  * @dir:        inode of the parent directory
    3786             :  * @dentry:     dentry of the directory to create
    3787             :  * @mode:       mode of the new directory
    3788             :  *
    3789             :  * Create a directory.
    3790             :  *
    3791             :  * If the inode has been found through an idmapped mount the user namespace of
    3792             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    3793             :  * care to map the inode according to @mnt_userns before checking permissions.
    3794             :  * On non-idmapped mounts or if permission checking is to be performed on the
    3795             :  * raw inode simply pass init_user_ns.
    3796             :  */
    3797         789 : int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
    3798             :               struct dentry *dentry, umode_t mode)
    3799             : {
    3800         789 :         int error = may_create(mnt_userns, dir, dentry);
    3801         789 :         unsigned max_links = dir->i_sb->s_max_links;
    3802             : 
    3803         789 :         if (error)
    3804             :                 return error;
    3805             : 
    3806         789 :         if (!dir->i_op->mkdir)
    3807             :                 return -EPERM;
    3808             : 
    3809         789 :         mode &= (S_IRWXUGO|S_ISVTX);
    3810         789 :         error = security_inode_mkdir(dir, dentry, mode);
    3811         789 :         if (error)
    3812             :                 return error;
    3813             : 
    3814         789 :         if (max_links && dir->i_nlink >= max_links)
    3815             :                 return -EMLINK;
    3816             : 
    3817         789 :         error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
    3818         789 :         if (!error)
    3819         785 :                 fsnotify_mkdir(dir, dentry);
    3820             :         return error;
    3821             : }
    3822             : EXPORT_SYMBOL(vfs_mkdir);
    3823             : 
    3824        4085 : static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
    3825             : {
    3826        4085 :         struct dentry *dentry;
    3827        4085 :         struct path path;
    3828        4085 :         int error;
    3829        4085 :         unsigned int lookup_flags = LOOKUP_DIRECTORY;
    3830             : 
    3831        4085 : retry:
    3832        4085 :         dentry = user_path_create(dfd, pathname, &path, lookup_flags);
    3833        4085 :         if (IS_ERR(dentry))
    3834        3301 :                 return PTR_ERR(dentry);
    3835             : 
    3836         784 :         if (!IS_POSIXACL(path.dentry->d_inode))
    3837         784 :                 mode &= ~current_umask();
    3838         784 :         error = security_path_mkdir(&path, dentry, mode);
    3839         784 :         if (!error) {
    3840         783 :                 struct user_namespace *mnt_userns;
    3841         783 :                 mnt_userns = mnt_user_ns(path.mnt);
    3842         783 :                 error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
    3843             :                                   mode);
    3844             :         }
    3845         784 :         done_path_create(&path, dentry);
    3846        1568 :         if (retry_estale(error, lookup_flags)) {
    3847           0 :                 lookup_flags |= LOOKUP_REVAL;
    3848           0 :                 goto retry;
    3849             :         }
    3850             :         return error;
    3851             : }
    3852             : 
    3853          78 : SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
    3854             : {
    3855          39 :         return do_mkdirat(dfd, pathname, mode);
    3856             : }
    3857             : 
    3858        8092 : SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
    3859             : {
    3860        4046 :         return do_mkdirat(AT_FDCWD, pathname, mode);
    3861             : }
    3862             : 
    3863             : /**
    3864             :  * vfs_rmdir - remove directory
    3865             :  * @mnt_userns: user namespace of the mount the inode was found from
    3866             :  * @dir:        inode of the parent directory
    3867             :  * @dentry:     dentry of the directory to remove
    3868             :  *
    3869             :  * Remove a directory.
    3870             :  *
    3871             :  * If the inode has been found through an idmapped mount the user namespace of
    3872             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    3873             :  * care to map the inode according to @mnt_userns before checking permissions.
    3874             :  * On non-idmapped mounts or if permission checking is to be performed on the
    3875             :  * raw inode simply pass init_user_ns.
    3876             :  */
    3877        1066 : int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
    3878             :                      struct dentry *dentry)
    3879             : {
    3880        1066 :         int error = may_delete(mnt_userns, dir, dentry, 1);
    3881             : 
    3882        1066 :         if (error)
    3883             :                 return error;
    3884             : 
    3885        1066 :         if (!dir->i_op->rmdir)
    3886             :                 return -EPERM;
    3887             : 
    3888        1066 :         dget(dentry);
    3889        1066 :         inode_lock(dentry->d_inode);
    3890             : 
    3891        1066 :         error = -EBUSY;
    3892        1066 :         if (is_local_mountpoint(dentry))
    3893         185 :                 goto out;
    3894             : 
    3895         881 :         error = security_inode_rmdir(dir, dentry);
    3896         881 :         if (error)
    3897           0 :                 goto out;
    3898             : 
    3899         881 :         error = dir->i_op->rmdir(dir, dentry);
    3900         881 :         if (error)
    3901         333 :                 goto out;
    3902             : 
    3903         548 :         shrink_dcache_parent(dentry);
    3904         548 :         dentry->d_inode->i_flags |= S_DEAD;
    3905         548 :         dont_mount(dentry);
    3906         548 :         detach_mounts(dentry);
    3907         548 :         fsnotify_rmdir(dir, dentry);
    3908             : 
    3909        1066 : out:
    3910        1066 :         inode_unlock(dentry->d_inode);
    3911        1066 :         dput(dentry);
    3912        1066 :         if (!error)
    3913         548 :                 d_delete(dentry);
    3914             :         return error;
    3915             : }
    3916             : EXPORT_SYMBOL(vfs_rmdir);
    3917             : 
    3918        1113 : long do_rmdir(int dfd, struct filename *name)
    3919             : {
    3920        1113 :         struct user_namespace *mnt_userns;
    3921        1113 :         int error = 0;
    3922        1113 :         struct dentry *dentry;
    3923        1113 :         struct path path;
    3924        1113 :         struct qstr last;
    3925        1113 :         int type;
    3926        1113 :         unsigned int lookup_flags = 0;
    3927        1113 : retry:
    3928        1113 :         name = filename_parentat(dfd, name, lookup_flags,
    3929             :                                 &path, &last, &type);
    3930        1113 :         if (IS_ERR(name))
    3931          38 :                 return PTR_ERR(name);
    3932             : 
    3933        1075 :         switch (type) {
    3934             :         case LAST_DOTDOT:
    3935             :                 error = -ENOTEMPTY;
    3936             :                 goto exit1;
    3937             :         case LAST_DOT:
    3938             :                 error = -EINVAL;
    3939             :                 goto exit1;
    3940             :         case LAST_ROOT:
    3941             :                 error = -EBUSY;
    3942             :                 goto exit1;
    3943             :         }
    3944             : 
    3945        1075 :         error = mnt_want_write(path.mnt);
    3946        1075 :         if (error)
    3947           0 :                 goto exit1;
    3948             : 
    3949        1075 :         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
    3950        1075 :         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
    3951        1075 :         error = PTR_ERR(dentry);
    3952        1075 :         if (IS_ERR(dentry))
    3953           0 :                 goto exit2;
    3954        1075 :         if (!dentry->d_inode) {
    3955           5 :                 error = -ENOENT;
    3956           5 :                 goto exit3;
    3957             :         }
    3958        1070 :         error = security_path_rmdir(&path, dentry);
    3959        1070 :         if (error)
    3960           4 :                 goto exit3;
    3961        1066 :         mnt_userns = mnt_user_ns(path.mnt);
    3962        1066 :         error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
    3963        1075 : exit3:
    3964        1075 :         dput(dentry);
    3965        1075 : exit2:
    3966        1075 :         inode_unlock(path.dentry->d_inode);
    3967        1075 :         mnt_drop_write(path.mnt);
    3968        1075 : exit1:
    3969        1075 :         path_put(&path);
    3970        2150 :         if (retry_estale(error, lookup_flags)) {
    3971           0 :                 lookup_flags |= LOOKUP_REVAL;
    3972           0 :                 goto retry;
    3973             :         }
    3974        1075 :         putname(name);
    3975        1075 :         return error;
    3976             : }
    3977             : 
    3978        2182 : SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
    3979             : {
    3980        1091 :         return do_rmdir(AT_FDCWD, getname(pathname));
    3981             : }
    3982             : 
    3983             : /**
    3984             :  * vfs_unlink - unlink a filesystem object
    3985             :  * @mnt_userns: user namespace of the mount the inode was found from
    3986             :  * @dir:        parent directory
    3987             :  * @dentry:     victim
    3988             :  * @delegated_inode: returns victim inode, if the inode is delegated.
    3989             :  *
    3990             :  * The caller must hold dir->i_mutex.
    3991             :  *
    3992             :  * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
    3993             :  * return a reference to the inode in delegated_inode.  The caller
    3994             :  * should then break the delegation on that inode and retry.  Because
    3995             :  * breaking a delegation may take a long time, the caller should drop
    3996             :  * dir->i_mutex before doing so.
    3997             :  *
    3998             :  * Alternatively, a caller may pass NULL for delegated_inode.  This may
    3999             :  * be appropriate for callers that expect the underlying filesystem not
    4000             :  * to be NFS exported.
    4001             :  *
    4002             :  * If the inode has been found through an idmapped mount the user namespace of
    4003             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    4004             :  * care to map the inode according to @mnt_userns before checking permissions.
    4005             :  * On non-idmapped mounts or if permission checking is to be performed on the
    4006             :  * raw inode simply pass init_user_ns.
    4007             :  */
    4008         796 : int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
    4009             :                struct dentry *dentry, struct inode **delegated_inode)
    4010             : {
    4011         796 :         struct inode *target = dentry->d_inode;
    4012         796 :         int error = may_delete(mnt_userns, dir, dentry, 0);
    4013             : 
    4014         796 :         if (error)
    4015             :                 return error;
    4016             : 
    4017         654 :         if (!dir->i_op->unlink)
    4018             :                 return -EPERM;
    4019             : 
    4020         654 :         inode_lock(target);
    4021         654 :         if (is_local_mountpoint(dentry))
    4022             :                 error = -EBUSY;
    4023             :         else {
    4024         654 :                 error = security_inode_unlink(dir, dentry);
    4025         654 :                 if (!error) {
    4026         654 :                         error = try_break_deleg(target, delegated_inode);
    4027         654 :                         if (error)
    4028           0 :                                 goto out;
    4029         654 :                         error = dir->i_op->unlink(dir, dentry);
    4030         654 :                         if (!error) {
    4031         654 :                                 dont_mount(dentry);
    4032         654 :                                 detach_mounts(dentry);
    4033         654 :                                 fsnotify_unlink(dir, dentry);
    4034             :                         }
    4035             :                 }
    4036             :         }
    4037           0 : out:
    4038         654 :         inode_unlock(target);
    4039             : 
    4040             :         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
    4041         654 :         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
    4042         654 :                 fsnotify_link_count(target);
    4043         654 :                 d_delete(dentry);
    4044             :         }
    4045             : 
    4046             :         return error;
    4047             : }
    4048             : EXPORT_SYMBOL(vfs_unlink);
    4049             : 
    4050             : /*
    4051             :  * Make sure that the actual truncation of the file will occur outside its
    4052             :  * directory's i_mutex.  Truncate can take a long time if there is a lot of
    4053             :  * writeout happening, and we don't want to prevent access to the directory
    4054             :  * while waiting on the I/O.
    4055             :  */
    4056         918 : long do_unlinkat(int dfd, struct filename *name)
    4057             : {
    4058         918 :         int error;
    4059         918 :         struct dentry *dentry;
    4060         918 :         struct path path;
    4061         918 :         struct qstr last;
    4062         918 :         int type;
    4063         918 :         struct inode *inode = NULL;
    4064         918 :         struct inode *delegated_inode = NULL;
    4065         918 :         unsigned int lookup_flags = 0;
    4066         918 : retry:
    4067         918 :         name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
    4068         918 :         if (IS_ERR(name))
    4069          39 :                 return PTR_ERR(name);
    4070             : 
    4071         879 :         error = -EISDIR;
    4072         879 :         if (type != LAST_NORM)
    4073           0 :                 goto exit1;
    4074             : 
    4075         879 :         error = mnt_want_write(path.mnt);
    4076         879 :         if (error)
    4077           1 :                 goto exit1;
    4078         878 : retry_deleg:
    4079         878 :         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
    4080         878 :         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
    4081         878 :         error = PTR_ERR(dentry);
    4082         878 :         if (!IS_ERR(dentry)) {
    4083         878 :                 struct user_namespace *mnt_userns;
    4084             : 
    4085             :                 /* Why not before? Because we want correct error value */
    4086         878 :                 if (last.name[last.len])
    4087           0 :                         goto slashes;
    4088         878 :                 inode = dentry->d_inode;
    4089         878 :                 if (d_is_negative(dentry))
    4090          83 :                         goto slashes;
    4091         795 :                 ihold(inode);
    4092         795 :                 error = security_path_unlink(&path, dentry);
    4093         795 :                 if (error)
    4094           3 :                         goto exit2;
    4095         792 :                 mnt_userns = mnt_user_ns(path.mnt);
    4096         792 :                 error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
    4097             :                                    &delegated_inode);
    4098         878 : exit2:
    4099         878 :                 dput(dentry);
    4100             :         }
    4101         878 :         inode_unlock(path.dentry->d_inode);
    4102         878 :         if (inode)
    4103         795 :                 iput(inode);    /* truncate the inode here */
    4104         878 :         inode = NULL;
    4105         878 :         if (delegated_inode) {
    4106           0 :                 error = break_deleg_wait(&delegated_inode);
    4107           0 :                 if (!error)
    4108           0 :                         goto retry_deleg;
    4109             :         }
    4110         878 :         mnt_drop_write(path.mnt);
    4111         879 : exit1:
    4112         879 :         path_put(&path);
    4113        1758 :         if (retry_estale(error, lookup_flags)) {
    4114           0 :                 lookup_flags |= LOOKUP_REVAL;
    4115           0 :                 inode = NULL;
    4116           0 :                 goto retry;
    4117             :         }
    4118         879 :         putname(name);
    4119         879 :         return error;
    4120             : 
    4121          83 : slashes:
    4122          83 :         if (d_is_negative(dentry))
    4123             :                 error = -ENOENT;
    4124           0 :         else if (d_is_dir(dentry))
    4125             :                 error = -EISDIR;
    4126             :         else
    4127           0 :                 error = -ENOTDIR;
    4128          83 :         goto exit2;
    4129             : }
    4130             : 
    4131         114 : SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
    4132             : {
    4133          57 :         if ((flag & ~AT_REMOVEDIR) != 0)
    4134             :                 return -EINVAL;
    4135             : 
    4136          57 :         if (flag & AT_REMOVEDIR)
    4137          22 :                 return do_rmdir(dfd, getname(pathname));
    4138          35 :         return do_unlinkat(dfd, getname(pathname));
    4139             : }
    4140             : 
    4141        1760 : SYSCALL_DEFINE1(unlink, const char __user *, pathname)
    4142             : {
    4143         880 :         return do_unlinkat(AT_FDCWD, getname(pathname));
    4144             : }
    4145             : 
    4146             : /**
    4147             :  * vfs_symlink - create symlink
    4148             :  * @mnt_userns: user namespace of the mount the inode was found from
    4149             :  * @dir:        inode of the parent directory
    4150             :  * @dentry:     dentry of the symlink to create
    4151             :  * @oldname:    name of the file to link to
    4152             :  *
    4153             :  * Create a symlink.
    4154             :  *
    4155             :  * If the inode has been found through an idmapped mount the user namespace of
    4156             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    4157             :  * care to map the inode according to @mnt_userns before checking permissions.
    4158             :  * On non-idmapped mounts or if permission checking is to be performed on the
    4159             :  * raw inode simply pass init_user_ns.
    4160             :  */
    4161         209 : int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
    4162             :                 struct dentry *dentry, const char *oldname)
    4163             : {
    4164         209 :         int error = may_create(mnt_userns, dir, dentry);
    4165             : 
    4166         209 :         if (error)
    4167             :                 return error;
    4168             : 
    4169         209 :         if (!dir->i_op->symlink)
    4170             :                 return -EPERM;
    4171             : 
    4172         209 :         error = security_inode_symlink(dir, dentry, oldname);
    4173         209 :         if (error)
    4174             :                 return error;
    4175             : 
    4176         209 :         error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
    4177         209 :         if (!error)
    4178         209 :                 fsnotify_create(dir, dentry);
    4179             :         return error;
    4180             : }
    4181             : EXPORT_SYMBOL(vfs_symlink);
    4182             : 
    4183         220 : static long do_symlinkat(const char __user *oldname, int newdfd,
    4184             :                   const char __user *newname)
    4185             : {
    4186         220 :         int error;
    4187         220 :         struct filename *from;
    4188         220 :         struct dentry *dentry;
    4189         220 :         struct path path;
    4190         220 :         unsigned int lookup_flags = 0;
    4191             : 
    4192         220 :         from = getname(oldname);
    4193         220 :         if (IS_ERR(from))
    4194           0 :                 return PTR_ERR(from);
    4195         220 : retry:
    4196         220 :         dentry = user_path_create(newdfd, newname, &path, lookup_flags);
    4197         220 :         error = PTR_ERR(dentry);
    4198         220 :         if (IS_ERR(dentry))
    4199          10 :                 goto out_putname;
    4200             : 
    4201         210 :         error = security_path_symlink(&path, dentry, from->name);
    4202         210 :         if (!error) {
    4203         209 :                 struct user_namespace *mnt_userns;
    4204             : 
    4205         209 :                 mnt_userns = mnt_user_ns(path.mnt);
    4206         209 :                 error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
    4207             :                                     from->name);
    4208             :         }
    4209         210 :         done_path_create(&path, dentry);
    4210         420 :         if (retry_estale(error, lookup_flags)) {
    4211           0 :                 lookup_flags |= LOOKUP_REVAL;
    4212           0 :                 goto retry;
    4213             :         }
    4214         210 : out_putname:
    4215         220 :         putname(from);
    4216         220 :         return error;
    4217             : }
    4218             : 
    4219           0 : SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
    4220             :                 int, newdfd, const char __user *, newname)
    4221             : {
    4222           0 :         return do_symlinkat(oldname, newdfd, newname);
    4223             : }
    4224             : 
    4225         440 : SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
    4226             : {
    4227         220 :         return do_symlinkat(oldname, AT_FDCWD, newname);
    4228             : }
    4229             : 
    4230             : /**
    4231             :  * vfs_link - create a new link
    4232             :  * @old_dentry: object to be linked
    4233             :  * @mnt_userns: the user namespace of the mount
    4234             :  * @dir:        new parent
    4235             :  * @new_dentry: where to create the new link
    4236             :  * @delegated_inode: returns inode needing a delegation break
    4237             :  *
    4238             :  * The caller must hold dir->i_mutex
    4239             :  *
    4240             :  * If vfs_link discovers a delegation on the to-be-linked file in need
    4241             :  * of breaking, it will return -EWOULDBLOCK and return a reference to the
    4242             :  * inode in delegated_inode.  The caller should then break the delegation
    4243             :  * and retry.  Because breaking a delegation may take a long time, the
    4244             :  * caller should drop the i_mutex before doing so.
    4245             :  *
    4246             :  * Alternatively, a caller may pass NULL for delegated_inode.  This may
    4247             :  * be appropriate for callers that expect the underlying filesystem not
    4248             :  * to be NFS exported.
    4249             :  *
    4250             :  * If the inode has been found through an idmapped mount the user namespace of
    4251             :  * the vfsmount must be passed through @mnt_userns. This function will then take
    4252             :  * care to map the inode according to @mnt_userns before checking permissions.
    4253             :  * On non-idmapped mounts or if permission checking is to be performed on the
    4254             :  * raw inode simply pass init_user_ns.
    4255             :  */
    4256          19 : int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
    4257             :              struct inode *dir, struct dentry *new_dentry,
    4258             :              struct inode **delegated_inode)
    4259             : {
    4260          19 :         struct inode *inode = old_dentry->d_inode;
    4261          19 :         unsigned max_links = dir->i_sb->s_max_links;
    4262          19 :         int error;
    4263             : 
    4264          19 :         if (!inode)
    4265             :                 return -ENOENT;
    4266             : 
    4267          19 :         error = may_create(mnt_userns, dir, new_dentry);
    4268          19 :         if (error)
    4269             :                 return error;
    4270             : 
    4271          19 :         if (dir->i_sb != inode->i_sb)
    4272             :                 return -EXDEV;
    4273             : 
    4274             :         /*
    4275             :          * A link to an append-only or immutable file cannot be created.
    4276             :          */
    4277          19 :         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
    4278             :                 return -EPERM;
    4279             :         /*
    4280             :          * Updating the link count will likely cause i_uid and i_gid to
    4281             :          * be written back improperly if their true value is unknown to
    4282             :          * the vfs.
    4283             :          */
    4284          38 :         if (HAS_UNMAPPED_ID(mnt_userns, inode))
    4285             :                 return -EPERM;
    4286          19 :         if (!dir->i_op->link)
    4287             :                 return -EPERM;
    4288          19 :         if (S_ISDIR(inode->i_mode))
    4289             :                 return -EPERM;
    4290             : 
    4291          19 :         error = security_inode_link(old_dentry, dir, new_dentry);
    4292          19 :         if (error)
    4293             :                 return error;
    4294             : 
    4295          19 :         inode_lock(inode);
    4296             :         /* Make sure we don't allow creating hardlink to an unlinked file */
    4297          19 :         if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
    4298             :                 error =  -ENOENT;
    4299          19 :         else if (max_links && inode->i_nlink >= max_links)
    4300             :                 error = -EMLINK;
    4301             :         else {
    4302          19 :                 error = try_break_deleg(inode, delegated_inode);
    4303          19 :                 if (!error)
    4304          19 :                         error = dir->i_op->link(old_dentry, dir, new_dentry);
    4305             :         }
    4306             : 
    4307          19 :         if (!error && (inode->i_state & I_LINKABLE)) {
    4308           3 :                 spin_lock(&inode->i_lock);
    4309           3 :                 inode->i_state &= ~I_LINKABLE;
    4310           3 :                 spin_unlock(&inode->i_lock);
    4311             :         }
    4312          19 :         inode_unlock(inode);
    4313          19 :         if (!error)
    4314          19 :                 fsnotify_link(dir, inode, new_dentry);
    4315             :         return error;
    4316             : }
    4317             : EXPORT_SYMBOL(vfs_link);
    4318             : 
    4319             : /*
    4320             :  * Hardlinks are often used in delicate situations.  We avoid
    4321             :  * security-related surprises by not following symlinks on the
    4322             :  * newname.  --KAB
    4323             :  *
    4324             :  * We don't follow them on the oldname either to be compatible
    4325             :  * with linux 2.0, and to avoid hard-linking to directories
    4326             :  * and other special files.  --ADM
    4327             :  */
    4328          26 : static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
    4329             :               const char __user *newname, int flags)
    4330             : {
    4331          26 :         struct user_namespace *mnt_userns;
    4332          26 :         struct dentry *new_dentry;
    4333          26 :         struct path old_path, new_path;
    4334          26 :         struct inode *delegated_inode = NULL;
    4335          26 :         int how = 0;
    4336          26 :         int error;
    4337             : 
    4338          26 :         if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
    4339             :                 return -EINVAL;
    4340             :         /*
    4341             :          * To use null names we require CAP_DAC_READ_SEARCH
    4342             :          * This ensures that not everyone will be able to create
    4343             :          * a hardlink using the passed file descriptor.
    4344             :          */
    4345          26 :         if (flags & AT_EMPTY_PATH) {
    4346           0 :                 if (!capable(CAP_DAC_READ_SEARCH))
    4347             :                         return -ENOENT;
    4348             :                 how = LOOKUP_EMPTY;
    4349             :         }
    4350             : 
    4351          26 :         if (flags & AT_SYMLINK_FOLLOW)
    4352           0 :                 how |= LOOKUP_FOLLOW;
    4353          26 : retry:
    4354          26 :         error = user_path_at(olddfd, oldname, how, &old_path);
    4355          26 :         if (error)
    4356           0 :                 return error;
    4357             : 
    4358          26 :         new_dentry = user_path_create(newdfd, newname, &new_path,
    4359             :                                         (how & LOOKUP_REVAL));
    4360          26 :         error = PTR_ERR(new_dentry);
    4361          26 :         if (IS_ERR(new_dentry))
    4362           0 :                 goto out;
    4363             : 
    4364          26 :         error = -EXDEV;
    4365          26 :         if (old_path.mnt != new_path.mnt)
    4366           0 :                 goto out_dput;
    4367          26 :         mnt_userns = mnt_user_ns(new_path.mnt);
    4368          26 :         error = may_linkat(mnt_userns, &old_path);
    4369          26 :         if (unlikely(error))
    4370           0 :                 goto out_dput;
    4371          26 :         error = security_path_link(old_path.dentry, &new_path, new_dentry);
    4372          26 :         if (error)
    4373          10 :                 goto out_dput;
    4374          16 :         error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
    4375             :                          new_dentry, &delegated_inode);
    4376          26 : out_dput:
    4377          26 :         done_path_create(&new_path, new_dentry);
    4378          26 :         if (delegated_inode) {
    4379           0 :                 error = break_deleg_wait(&delegated_inode);
    4380           0 :                 if (!error) {
    4381           0 :                         path_put(&old_path);
    4382           0 :                         goto retry;
    4383             :                 }
    4384             :         }
    4385          52 :         if (retry_estale(error, how)) {
    4386           0 :                 path_put(&old_path);
    4387           0 :                 how |= LOOKUP_REVAL;
    4388           0 :                 goto retry;
    4389             :         }
    4390          26 : out:
    4391          26 :         path_put(&old_path);
    4392             : 
    4393          26 :         return error;
    4394             : }
    4395             : 
    4396           0 : SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
    4397             :                 int, newdfd, const char __user *, newname, int, flags)
    4398             : {
    4399           0 :         return do_linkat(olddfd, oldname, newdfd, newname, flags);
    4400             : }
    4401             : 
    4402          52 : SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
    4403             : {
    4404          26 :         return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
    4405             : }
    4406             : 
    4407             : /**
    4408             :  * vfs_rename - rename a filesystem object
    4409             :  * @old_mnt_userns:     old user namespace of the mount the inode was found from
    4410             :  * @old_dir:            parent of source
    4411             :  * @old_dentry:         source
    4412             :  * @new_mnt_userns:     new user namespace of the mount the inode was found from
    4413             :  * @new_dir:            parent of destination
    4414             :  * @new_dentry:         destination
    4415             :  * @delegated_inode:    returns an inode needing a delegation break
    4416             :  * @flags:              rename flags
    4417             :  *
    4418             :  * The caller must hold multiple mutexes--see lock_rename().
    4419             :  *
    4420             :  * If vfs_rename discovers a delegation in need of breaking at either
    4421             :  * the source or destination, it will return -EWOULDBLOCK and return a
    4422             :  * reference to the inode in delegated_inode.  The caller should then
    4423             :  * break the delegation and retry.  Because breaking a delegation may
    4424             :  * take a long time, the caller should drop all locks before doing
    4425             :  * so.
    4426             :  *
    4427             :  * Alternatively, a caller may pass NULL for delegated_inode.  This may
    4428             :  * be appropriate for callers that expect the underlying filesystem not
    4429             :  * to be NFS exported.
    4430             :  *
    4431             :  * The worst of all namespace operations - renaming directory. "Perverted"
    4432             :  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
    4433             :  * Problems:
    4434             :  *
    4435             :  *      a) we can get into loop creation.
    4436             :  *      b) race potential - two innocent renames can create a loop together.
    4437             :  *         That's where 4.4 screws up. Current fix: serialization on
    4438             :  *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
    4439             :  *         story.
    4440             :  *      c) we have to lock _four_ objects - parents and victim (if it exists),
    4441             :  *         and source (if it is not a directory).
    4442             :  *         And that - after we got ->i_mutex on parents (until then we don't know
    4443             :  *         whether the target exists).  Solution: try to be smart with locking
    4444             :  *         order for inodes.  We rely on the fact that tree topology may change
    4445             :  *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
    4446             :  *         move will be locked.  Thus we can rank directories by the tree
    4447             :  *         (ancestors first) and rank all non-directories after them.
    4448             :  *         That works since everybody except rename does "lock parent, lookup,
    4449             :  *         lock child" and rename is under ->s_vfs_rename_mutex.
    4450             :  *         HOWEVER, it relies on the assumption that any object with ->lookup()
    4451             :  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
    4452             :  *         we'd better make sure that there's no link(2) for them.
    4453             :  *      d) conversion from fhandle to dentry may come in the wrong moment - when
    4454             :  *         we are removing the target. Solution: we will have to grab ->i_mutex
    4455             :  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
    4456             :  *         ->i_mutex on parents, which works but leads to some truly excessive
    4457             :  *         locking].
    4458             :  */
    4459         392 : int vfs_rename(struct renamedata *rd) /* Rename rd->old_dentry onto rd->new_dentry through old_dir->i_op->rename(); returns 0 or a negative errno. */
    4460             : {
    4461         392 :         int error;
    4462         392 :         struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
    4463         392 :         struct dentry *old_dentry = rd->old_dentry;
    4464         392 :         struct dentry *new_dentry = rd->new_dentry;
    4465         392 :         struct inode **delegated_inode = rd->delegated_inode;
    4466         392 :         unsigned int flags = rd->flags;
    4467         392 :         bool is_dir = d_is_dir(old_dentry);
    4468         392 :         struct inode *source = old_dentry->d_inode;
    4469         392 :         struct inode *target = new_dentry->d_inode;
    4470         392 :         bool new_is_dir = false;
    4471         392 :         unsigned max_links = new_dir->i_sb->s_max_links;
    4472         392 :         struct name_snapshot old_name;
    4473             : 
    4474         392 :         if (source == target) /* same inode on both sides: report success without doing anything */
    4475             :                 return 0;
    4476             : 
    4477         392 :         error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
    4478         392 :         if (error)
    4479             :                 return error;
    4480             : 
    4481         392 :         if (!target) {
    4482         297 :                 error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
    4483             :         } else {
    4484          95 :                 new_is_dir = d_is_dir(new_dentry);
    4485             : 
    4486          95 :                 if (!(flags & RENAME_EXCHANGE)) /* plain rename: the existing target is checked with the source's is_dir */
    4487          91 :                         error = may_delete(rd->new_mnt_userns, new_dir,
    4488             :                                            new_dentry, is_dir);
    4489             :                 else /* exchange: the target is checked with its own type */
    4490           4 :                         error = may_delete(rd->new_mnt_userns, new_dir,
    4491             :                                            new_dentry, new_is_dir);
    4492             :         }
    4493         392 :         if (error)
    4494             :                 return error;
    4495             : 
    4496         392 :         if (!old_dir->i_op->rename) /* no ->rename method on the source directory: -EPERM */
    4497             :                 return -EPERM;
    4498             : 
    4499             :         /*
    4500             :          * If we are going to change the parent - check write permissions,
    4501             :          * we'll need to flip '..'.
    4502             :          */
    4503         392 :         if (new_dir != old_dir) {
    4504           1 :                 if (is_dir) {
    4505           1 :                         error = inode_permission(rd->old_mnt_userns, source,
    4506             :                                                  MAY_WRITE);
    4507           1 :                         if (error)
    4508             :                                 return error;
    4509             :                 }
    4510           1 :                 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
    4511           0 :                         error = inode_permission(rd->new_mnt_userns, target,
    4512             :                                                  MAY_WRITE);
    4513           0 :                         if (error)
    4514             :                                 return error;
    4515             :                 }
    4516             :         }
    4517             : 
    4518         392 :         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
    4519             :                                       flags);
    4520         392 :         if (error)
    4521             :                 return error;
    4522             : 
    4523         392 :         take_dentry_name_snapshot(&old_name, old_dentry); /* keep the pre-rename name; it is handed to fsnotify_move() after the rename */
    4524         392 :         dget(new_dentry);
    4525         392 :         if (!is_dir || (flags & RENAME_EXCHANGE)) /* the unlock at out: is selected by the same condition */
    4526         390 :                 lock_two_nondirectories(source, target);
    4527           2 :         else if (target)
    4528           1 :                 inode_lock(target);
    4529             : 
    4530         392 :         error = -EBUSY; /* result if either dentry is a local mountpoint */
    4531         392 :         if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
    4532           0 :                 goto out;
    4533             : 
    4534         392 :         if (max_links && new_dir != old_dir) { /* link-count limit only matters when the parent changes */
    4535           0 :                 error = -EMLINK;
    4536           0 :                 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
    4537           0 :                         goto out;
    4538           0 :                 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
    4539           0 :                     old_dir->i_nlink >= max_links)
    4540           0 :                         goto out;
    4541             :         }
    4542         392 :         if (!is_dir) {
    4543         388 :                 error = try_break_deleg(source, delegated_inode);
    4544         388 :                 if (error)
    4545           0 :                         goto out;
    4546             :         }
    4547         392 :         if (target && !new_is_dir) {
    4548          93 :                 error = try_break_deleg(target, delegated_inode);
    4549          93 :                 if (error)
    4550           0 :                         goto out;
    4551             :         }
    4552         392 :         error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
    4553             :                                       new_dir, new_dentry, flags);
    4554         392 :         if (error)
    4555           0 :                 goto out;
    4556             : 
    4557         392 :         if (!(flags & RENAME_EXCHANGE) && target) { /* an existing target was replaced by a plain rename */
    4558          91 :                 if (is_dir) {
    4559           1 :                         shrink_dcache_parent(new_dentry);
    4560           1 :                         target->i_flags |= S_DEAD;
    4561             :                 }
    4562          91 :                 dont_mount(new_dentry);
    4563          91 :                 detach_mounts(new_dentry);
    4564             :         }
    4565         392 :         if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { /* skipped when the filesystem type sets FS_RENAME_DOES_D_MOVE */
    4566         392 :                 if (!(flags & RENAME_EXCHANGE))
    4567         388 :                         d_move(old_dentry, new_dentry);
    4568             :                 else
    4569           4 :                         d_exchange(old_dentry, new_dentry);
    4570             :         }
    4571           0 : out:
    4572         392 :         if (!is_dir || (flags & RENAME_EXCHANGE))
    4573         390 :                 unlock_two_nondirectories(source, target);
    4574           2 :         else if (target)
    4575           1 :                 inode_unlock(target);
    4576         392 :         dput(new_dentry); /* pairs with the dget() taken before locking */
    4577         392 :         if (!error) { /* notifications are sent only on success */
    4578         392 :                 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
    4579         392 :                               !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
    4580         392 :                 if (flags & RENAME_EXCHANGE) { /* exchange: second notification with the directories reversed */
    4581           4 :                         fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
    4582             :                                       new_is_dir, NULL, new_dentry);
    4583             :                 }
    4584             :         }
    4585         392 :         release_dentry_name_snapshot(&old_name);
    4586             : 
    4587         392 :         return error;
    4588             : }
    4589             : EXPORT_SYMBOL(vfs_rename);
    4590             : 
    4591         425 : int do_renameat2(int olddfd, struct filename *from, int newdfd, /* Resolve both parent directories, lock them, look up the last components and call vfs_rename(); returns 0 or a negative errno. */
    4592             :                  struct filename *to, unsigned int flags) /* from/to are released with putname() on exit unless they are error pointers */
    4593             : {
    4594         425 :         struct renamedata rd;
    4595         425 :         struct dentry *old_dentry, *new_dentry;
    4596         425 :         struct dentry *trap;
    4597         425 :         struct path old_path, new_path;
    4598         425 :         struct qstr old_last, new_last;
    4599         425 :         int old_type, new_type;
    4600         425 :         struct inode *delegated_inode = NULL;
    4601         425 :         unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
    4602         425 :         bool should_retry = false;
    4603         425 :         int error = -EINVAL; /* result of the two flag sanity checks below */
    4604             : 
    4605         425 :         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) /* unknown flag bits */
    4606           0 :                 goto put_both;
    4607             : 
    4608         425 :         if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && /* RENAME_EXCHANGE cannot be combined with either of these */
    4609           9 :             (flags & RENAME_EXCHANGE))
    4610           0 :                 goto put_both;
    4611             : 
    4612         425 :         if (flags & RENAME_EXCHANGE) /* exchange: the target lookup does not use LOOKUP_RENAME_TARGET */
    4613          14 :                 target_flags = 0;
    4614             : 
    4615         425 : retry:
    4616         425 :         from = filename_parentat(olddfd, from, lookup_flags, &old_path,
    4617             :                                         &old_last, &old_type);
    4618         425 :         if (IS_ERR(from)) {
    4619           0 :                 error = PTR_ERR(from);
    4620           0 :                 goto put_new; /* from is an error pointer here, so only to is released */
    4621             :         }
    4622             : 
    4623         425 :         to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
    4624             :                                 &new_type);
    4625         425 :         if (IS_ERR(to)) {
    4626           0 :                 error = PTR_ERR(to);
    4627           0 :                 goto exit1;
    4628             :         }
    4629             : 
    4630         425 :         error = -EXDEV;
    4631         425 :         if (old_path.mnt != new_path.mnt) /* both parents must be on the same mount */
    4632           0 :                 goto exit2;
    4633             : 
    4634         425 :         error = -EBUSY;
    4635         425 :         if (old_type != LAST_NORM)
    4636           0 :                 goto exit2;
    4637             : 
    4638         425 :         if (flags & RENAME_NOREPLACE) /* a non-LAST_NORM target yields -EEXIST with NOREPLACE, otherwise the -EBUSY set above */
    4639             :                 error = -EEXIST;
    4640         425 :         if (new_type != LAST_NORM)
    4641           0 :                 goto exit2;
    4642             : 
    4643         425 :         error = mnt_want_write(old_path.mnt);
    4644         425 :         if (error)
    4645           0 :                 goto exit2;
    4646             : 
    4647         425 : retry_deleg: /* re-entered after break_deleg_wait() succeeds below */
    4648         425 :         trap = lock_rename(new_path.dentry, old_path.dentry);
    4649             : 
    4650         425 :         old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
    4651         425 :         error = PTR_ERR(old_dentry);
    4652         425 :         if (IS_ERR(old_dentry))
    4653           0 :                 goto exit3;
    4654             :         /* source must exist */
    4655         425 :         error = -ENOENT;
    4656         425 :         if (d_is_negative(old_dentry))
    4657          12 :                 goto exit4;
    4658         413 :         new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
    4659         413 :         error = PTR_ERR(new_dentry);
    4660         413 :         if (IS_ERR(new_dentry))
    4661           0 :                 goto exit4;
    4662         413 :         error = -EEXIST;
    4663         413 :         if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
    4664           2 :                 goto exit5;
    4665         411 :         if (flags & RENAME_EXCHANGE) {
    4666          14 :                 error = -ENOENT; /* exchange needs an existing target */
    4667          14 :                 if (d_is_negative(new_dentry))
    4668           0 :                         goto exit5;
    4669             : 
    4670          21 :                 if (!d_is_dir(new_dentry)) {
    4671           7 :                         error = -ENOTDIR;
    4672           7 :                         if (new_last.name[new_last.len]) /* characters follow the last component of the new name */
    4673           0 :                                 goto exit5;
    4674             :                 }
    4675             :         }
    4676             :         /* unless the source is a directory trailing slashes give -ENOTDIR */
    4677         814 :         if (!d_is_dir(old_dentry)) {
    4678         403 :                 error = -ENOTDIR;
    4679         403 :                 if (old_last.name[old_last.len])
    4680           0 :                         goto exit5;
    4681         403 :                 if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
    4682           0 :                         goto exit5;
    4683             :         }
    4684             :         /* source should not be ancestor of target */
    4685         411 :         error = -EINVAL;
    4686         411 :         if (old_dentry == trap) /* trap is the dentry returned by lock_rename() */
    4687           0 :                 goto exit5;
    4688             :         /* target should not be an ancestor of source */
    4689         411 :         if (!(flags & RENAME_EXCHANGE)) /* exchange keeps -EINVAL for this case */
    4690         397 :                 error = -ENOTEMPTY;
    4691         411 :         if (new_dentry == trap)
    4692           0 :                 goto exit5;
    4693             : 
    4694         411 :         error = security_path_rename(&old_path, old_dentry,
    4695             :                                      &new_path, new_dentry, flags);
    4696         411 :         if (error)
    4697          22 :                 goto exit5;
    4698             : 
    4699         389 :         rd.old_dir         = old_path.dentry->d_inode;
    4700         389 :         rd.old_dentry      = old_dentry;
    4701         389 :         rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
    4702         389 :         rd.new_dir         = new_path.dentry->d_inode;
    4703         389 :         rd.new_dentry      = new_dentry;
    4704         389 :         rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
    4705         389 :         rd.delegated_inode = &delegated_inode;
    4706         389 :         rd.flags           = flags;
    4707         389 :         error = vfs_rename(&rd);
    4708         413 : exit5:
    4709         413 :         dput(new_dentry);
    4710         425 : exit4:
    4711         425 :         dput(old_dentry);
    4712         425 : exit3:
    4713         425 :         unlock_rename(new_path.dentry, old_path.dentry);
    4714         425 :         if (delegated_inode) { /* set through rd.delegated_inode: wait for the delegation break, then redo the locked section */
    4715           0 :                 error = break_deleg_wait(&delegated_inode);
    4716           0 :                 if (!error)
    4717           0 :                         goto retry_deleg;
    4718             :         }
    4719         425 :         mnt_drop_write(old_path.mnt);
    4720         425 : exit2:
    4721         425 :         if (retry_estale(error, lookup_flags)) /* decided here, acted on after both paths are put */
    4722           0 :                 should_retry = true;
    4723         425 :         path_put(&new_path);
    4724         425 : exit1:
    4725         425 :         path_put(&old_path);
    4726         425 :         if (should_retry) {
    4727           0 :                 should_retry = false;
    4728           0 :                 lookup_flags |= LOOKUP_REVAL; /* redo the whole lookup with LOOKUP_REVAL added */
    4729           0 :                 goto retry;
    4730             :         }
    4731         425 : put_both:
    4732         425 :         if (!IS_ERR(from))
    4733         425 :                 putname(from);
    4734           0 : put_new:
    4735         425 :         if (!IS_ERR(to))
    4736         425 :                 putname(to);
    4737         425 :         return error;
    4738             : }
    4739             : 
    4740          46 : SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, /* renameat2 syscall: wraps both user names with getname() and forwards everything to do_renameat2() */
    4741             :                 int, newdfd, const char __user *, newname, unsigned int, flags)
    4742             : {
    4743          23 :         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
    4744             :                                 flags);
    4745             : }
    4746             : 
    4747           0 : SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, /* renameat syscall: same as renameat2 with flags fixed to 0 */
    4748             :                 int, newdfd, const char __user *, newname)
    4749             : {
    4750           0 :         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
    4751             :                                 0);
    4752             : }
    4753             : 
    4754         804 : SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) /* rename syscall: do_renameat2() with AT_FDCWD for both directory fds and flags 0 */
    4755             : {
    4756         402 :         return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
    4757             :                                 getname(newname), 0);
    4758             : }
    4759             : 
    4760        1106 : int readlink_copy(char __user *buffer, int buflen, const char *link) /* Copy up to buflen bytes of link to user memory; returns the byte count, PTR_ERR(link) for an error pointer, or -EFAULT. */
    4761             : {
    4762        1106 :         int len = PTR_ERR(link); /* only meaningful if link is an error pointer; overwritten otherwise */
    4763        1106 :         if (IS_ERR(link))
    4764           0 :                 goto out;
    4765             : 
    4766        1106 :         len = strlen(link); /* the terminating NUL is not counted, so it is not copied */
    4767        1106 :         if (len > (unsigned) buflen) /* truncate to the caller's buffer size */
    4768           0 :                 len = buflen;
    4769        2212 :         if (copy_to_user(buffer, link, len))
    4770           0 :                 len = -EFAULT;
    4771        1106 : out:
    4772        1106 :         return len;
    4773             : }
    4774             : 
    4775             : /**
    4776             :  * vfs_readlink - copy symlink body into userspace buffer
    4777             :  * @dentry: dentry on which to get symbolic link
    4778             :  * @buffer: user memory pointer
    4779             :  * @buflen: size of buffer
    4780             :  *
    4781             :  * Does not touch atime.  That's up to the caller if necessary
    4782             :  *
    4783             :  * Does not call security hook.
    4784             :  */
    4785        1156 : int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) /* returns readlink_copy()'s result, the filesystem's ->readlink() result, or a negative errno */
    4786             : {
    4787        1156 :         struct inode *inode = d_inode(dentry);
    4788        1156 :         DEFINE_DELAYED_CALL(done);
    4789        1156 :         const char *link;
    4790        1156 :         int res;
    4791             : 
    4792        1156 :         if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { /* flag not set yet: decide which readlink path applies */
    4793         518 :                 if (unlikely(inode->i_op->readlink)) /* a filesystem-provided ->readlink takes over completely */
    4794          50 :                         return inode->i_op->readlink(dentry, buffer, buflen);
    4795             : 
    4796         468 :                 if (!d_is_symlink(dentry))
    4797             :                         return -EINVAL;
    4798             : 
    4799         468 :                 spin_lock(&inode->i_lock); /* i_opflags is updated under i_lock */
    4800         468 :                 inode->i_opflags |= IOP_DEFAULT_READLINK; /* later calls skip the checks above */
    4801         468 :                 spin_unlock(&inode->i_lock);
    4802             :         }
    4803             : 
    4804        1106 :         link = READ_ONCE(inode->i_link); /* use i_link when set, otherwise ask ->get_link() */
    4805        1106 :         if (!link) {
    4806         860 :                 link = inode->i_op->get_link(dentry, inode, &done);
    4807         861 :                 if (IS_ERR(link))
    4808           0 :                         return PTR_ERR(link);
    4809             :         }
    4810        1107 :         res = readlink_copy(buffer, buflen, link);
    4811        1106 :         do_delayed_call(&done); /* presumably releases whatever ->get_link() registered in done - TODO confirm */
    4812             :         return res;
    4813             : }
    4814             : EXPORT_SYMBOL(vfs_readlink);
    4815             : 
    4816             : /**
    4817             :  * vfs_get_link - get symlink body
    4818             :  * @dentry: dentry on which to get symbolic link
    4819             :  * @done: caller needs to free returned data with this
    4820             :  *
    4821             :  * Calls security hook and i_op->get_link() on the supplied inode.
    4822             :  *
    4823             :  * It does not touch atime.  That's up to the caller if necessary.
    4824             :  *
    4825             :  * Does not work on "special" symlinks like /proc/$$/fd/N
    4826             :  */
    4827           0 : const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) /* returns ->get_link()'s result, or an error pointer (-EINVAL when dentry is not a symlink) */
    4828             : {
    4829           0 :         const char *res = ERR_PTR(-EINVAL); /* returned unchanged when dentry is not a symlink */
    4830           0 :         struct inode *inode = d_inode(dentry);
    4831             : 
    4832           0 :         if (d_is_symlink(dentry)) {
    4833           0 :                 res = ERR_PTR(security_inode_readlink(dentry)); /* hook result carried as a pointer; 0 becomes NULL */
    4834           0 :                 if (!res) /* hook returned 0: go on to the filesystem */
    4835           0 :                         res = inode->i_op->get_link(dentry, inode, done);
    4836             :         }
    4837           0 :         return res;
    4838             : }
    4839             : EXPORT_SYMBOL(vfs_get_link);
    4840             : 
    4841             : /* get the link contents into pagecache */
    4842           0 : const char *page_get_link(struct dentry *dentry, struct inode *inode, /* returns the address of page 0 of inode->i_mapping as the link body, or an error pointer */
    4843             :                           struct delayed_call *callback)
    4844             : {
    4845           0 :         char *kaddr;
    4846           0 :         struct page *page;
    4847           0 :         struct address_space *mapping = inode->i_mapping;
    4848             : 
    4849           0 :         if (!dentry) { /* no dentry: accept only a page find_get_page() returns that is already up to date, else -ECHILD */
    4850           0 :                 page = find_get_page(mapping, 0);
    4851           0 :                 if (!page)
    4852           0 :                         return ERR_PTR(-ECHILD);
    4853           0 :                 if (!PageUptodate(page)) {
    4854           0 :                         put_page(page);
    4855           0 :                         return ERR_PTR(-ECHILD);
    4856             :                 }
    4857             :         } else {
    4858           0 :                 page = read_mapping_page(mapping, 0, NULL);
    4859           0 :                 if (IS_ERR(page))
    4860             :                         return (char*)page; /* pass the error pointer through with the return type's cast */
    4861             :         }
    4862           0 :         set_delayed_call(callback, page_put_link, page); /* caller later runs page_put_link(page) via the delayed call */
    4863           0 :         BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); /* presumably because page_address() below needs a directly addressable page - TODO confirm */
    4864           0 :         kaddr = page_address(page);
    4865           0 :         nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); /* presumably NUL-terminates at min(i_size, PAGE_SIZE - 1) - verify against the helper */
    4866           0 :         return kaddr;
    4867             : }
    4868             : 
    4869             : EXPORT_SYMBOL(page_get_link);
    4870             : 
    4871           0 : void page_put_link(void *arg) /* delayed-call callback installed by page_get_link(); arg is the struct page to release */
    4872             : {
    4873           0 :         put_page(arg);
    4874           0 : }
    4875             : EXPORT_SYMBOL(page_put_link);
    4876             : 
    4877           0 : int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) /* readlink built on page_get_link(): returns readlink_copy()'s result */
    4878             : {
    4879           0 :         DEFINE_DELAYED_CALL(done);
    4880           0 :         int res = readlink_copy(buffer, buflen, /* readlink_copy() turns an error pointer from page_get_link() into its errno */
    4881             :                                 page_get_link(dentry, d_inode(dentry),
    4882             :                                               &done));
    4883           0 :         do_delayed_call(&done); /* runs the callback page_get_link() registered, if any */
    4884           0 :         return res;
    4885             : }
    4886             : EXPORT_SYMBOL(page_readlink);
    4887             : 
    4888             : /*
    4889             :  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
    4890             :  */
    4891           0 : int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) /* write len-1 bytes of symname at offset 0 of inode->i_mapping; returns 0 or a negative errno */
    4892             : {
    4893           0 :         struct address_space *mapping = inode->i_mapping;
    4894           0 :         struct page *page;
    4895           0 :         void *fsdata;
    4896           0 :         int err;
    4897           0 :         unsigned int flags = 0;
    4898           0 :         if (nofs)
    4899           0 :                 flags |= AOP_FLAG_NOFS;
    4900             : 
    4901           0 : retry:
    4902           0 :         err = pagecache_write_begin(NULL, mapping, 0, len-1, /* len-1: presumably len counts the trailing NUL, which is not written - TODO confirm with callers */
    4903             :                                 flags, &page, &fsdata);
    4904           0 :         if (err)
    4905           0 :                 goto fail;
    4906             : 
    4907           0 :         memcpy(page_address(page), symname, len-1);
    4908             : 
    4909           0 :         err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
    4910             :                                                         page, fsdata);
    4911           0 :         if (err < 0)
    4912           0 :                 goto fail;
    4913           0 :         if (err < len-1) /* write_end reported fewer than len-1 bytes: start over */
    4914           0 :                 goto retry;
    4915             : 
    4916           0 :         mark_inode_dirty(inode);
    4917           0 :         return 0;
    4918             : fail:
    4919             :         return err;
    4920             : }
    4921             : EXPORT_SYMBOL(__page_symlink);
    4922             : 
    4923           0 : int page_symlink(struct inode *inode, const char *symname, int len) /* __page_symlink() with nofs derived from the mapping's gfp constraint */
    4924             : {
    4925           0 :         return __page_symlink(inode, symname, len,
    4926           0 :                         !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); /* nofs is true when the __GFP_FS constraint evaluates to zero */
    4927             : }
    4928             : EXPORT_SYMBOL(page_symlink);
    4929             : 
    4930             : const struct inode_operations page_symlink_inode_operations = { /* inode operations table that sets only .get_link */
    4931             :         .get_link       = page_get_link,
    4932             : };
    4933             : EXPORT_SYMBOL(page_symlink_inode_operations);

Generated by: LCOV version 1.14