Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/namespace.c
4 : *
5 : * (C) Copyright Al Viro 2000, 2001
6 : *
7 : * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 : * Heavily rewritten.
9 : */
10 :
11 : #include <linux/syscalls.h>
12 : #include <linux/export.h>
13 : #include <linux/capability.h>
14 : #include <linux/mnt_namespace.h>
15 : #include <linux/user_namespace.h>
16 : #include <linux/namei.h>
17 : #include <linux/security.h>
18 : #include <linux/cred.h>
19 : #include <linux/idr.h>
20 : #include <linux/init.h> /* init_rootfs */
21 : #include <linux/fs_struct.h> /* get_fs_root et al. */
22 : #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23 : #include <linux/file.h>
24 : #include <linux/uaccess.h>
25 : #include <linux/proc_ns.h>
26 : #include <linux/magic.h>
27 : #include <linux/memblock.h>
28 : #include <linux/proc_fs.h>
29 : #include <linux/task_work.h>
30 : #include <linux/sched/task.h>
31 : #include <uapi/linux/mount.h>
32 : #include <linux/fs_context.h>
33 : #include <linux/shmem_fs.h>
34 :
35 : #include "pnode.h"
36 : #include "internal.h"
37 :
38 : /* Maximum number of mounts in a mount namespace */
39 : unsigned int sysctl_mount_max __read_mostly = 100000;
40 :
41 : static unsigned int m_hash_mask __read_mostly;
42 : static unsigned int m_hash_shift __read_mostly;
43 : static unsigned int mp_hash_mask __read_mostly;
44 : static unsigned int mp_hash_shift __read_mostly;
45 :
46 : static __initdata unsigned long mhash_entries;
47 0 : static int __init set_mhash_entries(char *str)
48 : {
49 0 : if (!str)
50 : return 0;
51 0 : mhash_entries = simple_strtoul(str, &str, 0);
52 0 : return 1;
53 : }
54 : __setup("mhash_entries=", set_mhash_entries);
55 :
56 : static __initdata unsigned long mphash_entries;
57 0 : static int __init set_mphash_entries(char *str)
58 : {
59 0 : if (!str)
60 : return 0;
61 0 : mphash_entries = simple_strtoul(str, &str, 0);
62 0 : return 1;
63 : }
64 : __setup("mphash_entries=", set_mphash_entries);
65 :
66 : static u64 event;
67 : static DEFINE_IDA(mnt_id_ida);
68 : static DEFINE_IDA(mnt_group_ida);
69 :
70 : static struct hlist_head *mount_hashtable __read_mostly;
71 : static struct hlist_head *mountpoint_hashtable __read_mostly;
72 : static struct kmem_cache *mnt_cache __read_mostly;
73 : static DECLARE_RWSEM(namespace_sem);
74 : static HLIST_HEAD(unmounted); /* protected by namespace_sem */
75 : static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
76 :
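/*
 * A parsed mount_setattr(2) request (field meanings inferred from the
 * callers further down in this file):
 * @attr_set:     MNT_* flags to set
 * @attr_clr:     MNT_* flags to clear
 * @propagation:  MS_SHARED/MS_SLAVE/MS_PRIVATE/MS_UNBINDABLE, if changing
 * @lookup_flags: LOOKUP_* flags for resolving the target path
 * @recurse:      apply to the whole subtree (AT_RECURSIVE)
 * @mnt_userns:   user namespace to attach for idmapped mounts
 */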
77 : struct mount_kattr {
78 : unsigned int attr_set;
79 : unsigned int attr_clr;
80 : unsigned int propagation;
81 : unsigned int lookup_flags;
82 : bool recurse;
83 : struct user_namespace *mnt_userns;
84 : };
85 :
86 : /* /sys/fs */
87 : struct kobject *fs_kobj;
88 : EXPORT_SYMBOL_GPL(fs_kobj);
89 :
90 : /*
91 : * vfsmount lock may be taken for read to prevent changes to the
92 : * vfsmount hash, i.e. during mountpoint lookups or walking back
93 : * up the tree.
94 : *
95 : * It should be taken for write in all cases where the vfsmount
96 : * tree or hash is modified or when a vfsmount structure is modified.
97 : */
98 : __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
99 :
100 3499 : static inline void lock_mount_hash(void)
101 : {
102 3499 : write_seqlock(&mount_lock);
103 110 : }
104 :
105 3499 : static inline void unlock_mount_hash(void)
106 : {
107 3499 : write_sequnlock(&mount_lock);
108 3499 : }
109 :
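/*
 * Hash the (parent mount, mountpoint dentry) pair to a chain in
 * mount_hashtable; __lookup_mnt() walks that chain to find what is
 * mounted on a given dentry in a given parent.
 */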
110 36411 : static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
111 : {
112 36411 : unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
113 36411 : tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
114 36411 : tmp = tmp + (tmp >> m_hash_shift);
115 36411 : return &mount_hashtable[tmp & m_hash_mask];
116 : }
117 :
118 320 : static inline struct hlist_head *mp_hash(struct dentry *dentry)
119 : {
120 320 : unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
121 320 : tmp = tmp + (tmp >> mp_hash_shift);
122 320 : return &mountpoint_hashtable[tmp & mp_hash_mask];
123 : }
124 :
125 1171 : static int mnt_alloc_id(struct mount *mnt)
126 : {
127 1171 : int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
128 :
129 1171 : if (res < 0)
130 : return res;
131 1171 : mnt->mnt_id = res;
132 1171 : return 0;
133 : }
134 :
135 1069 : static void mnt_free_id(struct mount *mnt)
136 : {
137 1069 : ida_free(&mnt_id_ida, mnt->mnt_id);
138 0 : }
139 :
140 : /*
141 : * Allocate a new peer group ID
142 : */
143 219 : static int mnt_alloc_group_id(struct mount *mnt)
144 : {
145 219 : int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
146 :
147 219 : if (res < 0)
148 : return res;
149 219 : mnt->mnt_group_id = res;
150 219 : return 0;
151 : }
152 :
153 : /*
154 : * Release a peer group ID
155 : */
156 161 : void mnt_release_group_id(struct mount *mnt)
157 : {
158 161 : ida_free(&mnt_group_ida, mnt->mnt_group_id);
159 161 : mnt->mnt_group_id = 0;
160 0 : }
161 :
162 : /*
163 : * vfsmount lock must be held for read
164 : */
165 311560 : static inline void mnt_add_count(struct mount *mnt, int n)
166 : {
167 : #ifdef CONFIG_SMP
168 61141 : this_cpu_add(mnt->mnt_pcp->mnt_count, n);
169 : #else
170 : preempt_disable();
171 : mnt->mnt_count += n;
172 : preempt_enable();
173 : #endif
174 61156 : }
175 :
176 : /*
177 : * vfsmount lock must be held for write
178 : */
179 1526 : int mnt_get_count(struct mount *mnt)
180 : {
181 : #ifdef CONFIG_SMP
182 1526 : int count = 0;
183 1526 : int cpu;
184 :
185 7630 : for_each_possible_cpu(cpu) {
186 6104 : count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
187 : }
188 :
189 1526 : return count;
190 : #else
191 : return mnt->mnt_count;
192 : #endif
193 : }
194 :
195 1171 : static struct mount *alloc_vfsmnt(const char *name)
196 : {
197 1171 : struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
198 1171 : if (mnt) {
199 1171 : int err;
200 :
201 1171 : err = mnt_alloc_id(mnt);
202 1171 : if (err)
203 0 : goto out_free_cache;
204 :
205 1171 : if (name) {
206 1171 : mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
207 1171 : if (!mnt->mnt_devname)
208 0 : goto out_free_id;
209 : }
210 :
211 : #ifdef CONFIG_SMP
212 1171 : mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
213 1171 : if (!mnt->mnt_pcp)
214 0 : goto out_free_devname;
215 :
216 1171 : this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
217 : #else
218 : mnt->mnt_count = 1;
219 : mnt->mnt_writers = 0;
220 : #endif
221 :
222 1171 : INIT_HLIST_NODE(&mnt->mnt_hash);
223 1171 : INIT_LIST_HEAD(&mnt->mnt_child);
224 1171 : INIT_LIST_HEAD(&mnt->mnt_mounts);
225 1171 : INIT_LIST_HEAD(&mnt->mnt_list);
226 1171 : INIT_LIST_HEAD(&mnt->mnt_expire);
227 1171 : INIT_LIST_HEAD(&mnt->mnt_share);
228 1171 : INIT_LIST_HEAD(&mnt->mnt_slave_list);
229 1171 : INIT_LIST_HEAD(&mnt->mnt_slave);
230 1171 : INIT_HLIST_NODE(&mnt->mnt_mp_list);
231 1171 : INIT_LIST_HEAD(&mnt->mnt_umounting);
232 1171 : INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
233 1171 : mnt->mnt.mnt_userns = &init_user_ns;
234 : }
235 : return mnt;
236 :
237 : #ifdef CONFIG_SMP
238 0 : out_free_devname:
239 0 : kfree_const(mnt->mnt_devname);
240 : #endif
241 0 : out_free_id:
242 0 : mnt_free_id(mnt);
243 0 : out_free_cache:
244 0 : kmem_cache_free(mnt_cache, mnt);
245 0 : return NULL;
246 : }
247 :
248 : /*
249 : * Most r/o checks on a fs are for operations that take
250 : * discrete amounts of time, like a write() or unlink().
251 : * We must keep track of when those operations start
252 : * (for permission checks) and when they end, so that
253 : * we can determine when writes are able to occur to
254 : * a filesystem.
255 : */
256 : /*
257 : * __mnt_is_readonly: check whether a mount is read-only
258 : * @mnt: the mount to check for its write status
259 : *
260 : * This shouldn't be used directly outside of the VFS.
261 : * It does not guarantee that the filesystem will stay
262 : * r/w, just that it is r/w right *now*. This cannot and
263 : * should not be used in place of IS_RDONLY(inode).
264 : * mnt_want/drop_write() will _keep_ the filesystem
265 : * r/w.
266 : */
267 14461 : bool __mnt_is_readonly(struct vfsmount *mnt)
268 : {
269 14312 : return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
270 : }
271 : EXPORT_SYMBOL_GPL(__mnt_is_readonly);
272 :
273 13387 : static inline void mnt_inc_writers(struct mount *mnt)
274 : {
275 : #ifdef CONFIG_SMP
276 26774 : this_cpu_inc(mnt->mnt_pcp->mnt_writers);
277 : #else
278 : mnt->mnt_writers++;
279 : #endif
280 : }
281 :
282 13376 : static inline void mnt_dec_writers(struct mount *mnt)
283 : {
284 : #ifdef CONFIG_SMP
285 13263 : this_cpu_dec(mnt->mnt_pcp->mnt_writers);
286 : #else
287 : mnt->mnt_writers--;
288 : #endif
289 : }
290 :
291 1101 : static unsigned int mnt_get_writers(struct mount *mnt)
292 : {
293 : #ifdef CONFIG_SMP
294 1101 : unsigned int count = 0;
295 1101 : int cpu;
296 :
297 5505 : for_each_possible_cpu(cpu) {
298 4404 : count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
299 : }
300 :
301 1101 : return count;
302 : #else
303 : return mnt->mnt_writers;
304 : #endif
305 : }
306 :
307 13387 : static int mnt_is_readonly(struct vfsmount *mnt)
308 : {
309 13387 : if (mnt->mnt_sb->s_readonly_remount)
310 : return 1;
311 : /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
312 13387 : smp_rmb();
313 26661 : return __mnt_is_readonly(mnt);
314 : }
315 :
316 : /*
317 : * Most r/o & frozen checks on a fs are for operations that take discrete
318 : * amounts of time, like a write() or unlink(). We must keep track of when
319 : * those operations start (for permission checks) and when they end, so that we
320 : * can determine when writes are able to occur to a filesystem.
321 : */
322 : /**
323 : * __mnt_want_write - get write access to a mount without freeze protection
324 : * @m: the mount on which to take a write
325 : *
326 : * This tells the low-level filesystem that a write is about to be performed to
327 : * it, and makes sure that writes are allowed (mount is read-write) before
328 : * returning success. This operation does not protect against filesystem being
329 : * frozen. When the write operation is finished, __mnt_drop_write() must be
330 : * called. This is effectively a refcount.
331 : */
332 13387 : int __mnt_want_write(struct vfsmount *m)
333 : {
334 13387 : struct mount *mnt = real_mount(m);
335 13387 : int ret = 0;
336 :
337 13387 : preempt_disable();
338 13387 : mnt_inc_writers(mnt);
339 : /*
340 : * The store done by mnt_inc_writers() must be visible before we
341 : * enter the MNT_WRITE_HOLD loop below, so that the slowpath can see our
342 : * incremented count after it has set MNT_WRITE_HOLD.
343 : */
344 13387 : smp_mb();
345 13387 : while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
346 0 : cpu_relax();
347 : /*
348 : * After the slowpath clears MNT_WRITE_HOLD, the readonly state checked
349 : * by mnt_is_readonly() is settled. So we must not load it until
350 : * MNT_WRITE_HOLD is cleared.
351 : */
352 13387 : smp_rmb();
353 13387 : if (mnt_is_readonly(m)) {
354 113 : mnt_dec_writers(mnt);
355 113 : ret = -EROFS;
356 : }
357 13387 : preempt_enable();
358 :
359 13387 : return ret;
360 : }
361 :
362 : /**
363 : * mnt_want_write - get write access to a mount
364 : * @m: the mount on which to take a write
365 : *
366 : * This tells the low-level filesystem that a write is about to be performed to
367 : * it, and makes sure that writes are allowed (mount is read-write, filesystem
368 : * is not frozen) before returning success. When the write operation is
369 : * finished, mnt_drop_write() must be called. This is effectively a refcount.
370 : */
371 8526 : int mnt_want_write(struct vfsmount *m)
372 : {
373 8526 : int ret;
374 :
375 8526 : sb_start_write(m->mnt_sb);
376 8526 : ret = __mnt_want_write(m);
377 8526 : if (ret)
378 17 : sb_end_write(m->mnt_sb);
379 8526 : return ret;
380 : }
381 : EXPORT_SYMBOL_GPL(mnt_want_write);
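
/*
 * Typical caller pattern (illustrative sketch, simplified; the helper
 * name below is hypothetical, standing in for e.g. vfs_unlink()):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	err = do_the_modification(...);
 *	mnt_drop_write(path->mnt);
 *	return err;
 */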
382 :
383 : /**
384 : * __mnt_want_write_file - get write access to a file's mount
385 : * @file: the file whose mount to take a write on
386 : *
387 : * This is like __mnt_want_write, but if the file is already open for writing it
388 : * skips incrementing mnt_writers (since the open file already has a reference)
389 : * and instead only does the check for emergency r/o remounts. This must be
390 : * paired with __mnt_drop_write_file.
391 : */
392 1284 : int __mnt_want_write_file(struct file *file)
393 : {
394 1284 : if (file->f_mode & FMODE_WRITER) {
395 : /*
396 : * Superblock may have become readonly while there are still
397 : * writable fd's, e.g. due to a fs error with errors=remount-ro
398 : */
399 1696 : if (__mnt_is_readonly(file->f_path.mnt))
400 : return -EROFS;
401 848 : return 0;
402 : }
403 436 : return __mnt_want_write(file->f_path.mnt);
404 : }
405 :
406 : /**
407 : * mnt_want_write_file - get write access to a file's mount
408 : * @file: the file whose mount to take a write on
409 : *
410 : * This is like mnt_want_write, but if the file is already open for writing it
411 : * skips incrementing mnt_writers (since the open file already has a reference)
412 : * and instead only does the freeze protection and the check for emergency r/o
413 : * remounts. This must be paired with mnt_drop_write_file.
414 : */
415 68 : int mnt_want_write_file(struct file *file)
416 : {
417 68 : int ret;
418 :
419 68 : sb_start_write(file_inode(file)->i_sb);
420 68 : ret = __mnt_want_write_file(file);
421 68 : if (ret)
422 0 : sb_end_write(file_inode(file)->i_sb);
423 68 : return ret;
424 : }
425 : EXPORT_SYMBOL_GPL(mnt_want_write_file);
426 :
427 : /**
428 : * __mnt_drop_write - give up write access to a mount
429 : * @mnt: the mount on which to give up write access
430 : *
431 : * Tells the low-level filesystem that we are done
432 : * performing writes to it. Must be matched with
433 : * __mnt_want_write() call above.
434 : */
435 13263 : void __mnt_drop_write(struct vfsmount *mnt)
436 : {
437 4318 : preempt_disable();
438 13263 : mnt_dec_writers(real_mount(mnt));
439 13263 : preempt_enable();
440 436 : }
441 :
442 : /**
443 : * mnt_drop_write - give up write access to a mount
444 : * @mnt: the mount on which to give up write access
445 : *
446 : * Tells the low-level filesystem that we are done performing writes to it and
447 : * also allows filesystem to be frozen again. Must be matched with
448 : * mnt_want_write() call above.
449 : */
450 8509 : void mnt_drop_write(struct vfsmount *mnt)
451 : {
452 8509 : __mnt_drop_write(mnt);
453 8509 : sb_end_write(mnt->mnt_sb);
454 8509 : }
455 : EXPORT_SYMBOL_GPL(mnt_drop_write);
456 :
457 1284 : void __mnt_drop_write_file(struct file *file)
458 : {
459 1284 : if (!(file->f_mode & FMODE_WRITER))
460 436 : __mnt_drop_write(file->f_path.mnt);
461 1284 : }
462 :
463 68 : void mnt_drop_write_file(struct file *file)
464 : {
465 68 : __mnt_drop_write_file(file);
466 68 : sb_end_write(file_inode(file)->i_sb);
467 68 : }
468 : EXPORT_SYMBOL(mnt_drop_write_file);
469 :
470 31 : static inline int mnt_hold_writers(struct mount *mnt)
471 : {
472 31 : mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
473 : /*
474 : * After storing MNT_WRITE_HOLD, we'll read the counters. This store
475 : * should be visible before we do.
476 : */
477 31 : smp_mb();
478 :
479 : /*
480 : * With writers on hold, if this value is zero, then there are
481 : * definitely no active writers (although held writers may subsequently
482 : * increment the count, they'll have to wait, and decrement it after
483 : * seeing MNT_READONLY).
484 : *
485 : * It is OK to have counter incremented on one CPU and decremented on
486 : * another: the sum will add up correctly. The danger would be when we
487 : * sum up each counter, if we read a counter before it is incremented,
488 : * but then read another CPU's count which it has been subsequently
489 : * decremented from -- we would see more decrements than we should.
490 : * MNT_WRITE_HOLD protects against this scenario, because
491 : * mnt_want_write first increments count, then smp_mb, then spins on
492 : * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
493 : * we're counting up here.
494 : */
495 31 : if (mnt_get_writers(mnt) > 0)
496 0 : return -EBUSY;
497 :
498 : return 0;
499 : }
500 :
501 31 : static inline void mnt_unhold_writers(struct mount *mnt)
502 : {
503 : /*
504 : * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
505 : * that become unheld will see MNT_READONLY.
506 : */
507 31 : smp_wmb();
508 31 : mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
509 0 : }
510 :
511 31 : static int mnt_make_readonly(struct mount *mnt)
512 : {
513 31 : int ret;
514 :
515 31 : ret = mnt_hold_writers(mnt);
516 31 : if (!ret)
517 31 : mnt->mnt.mnt_flags |= MNT_READONLY;
518 31 : mnt_unhold_writers(mnt);
519 31 : return ret;
520 : }
521 :
522 1 : int sb_prepare_remount_readonly(struct super_block *sb)
523 : {
524 1 : struct mount *mnt;
525 1 : int err = 0;
526 :
527 : /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
528 1 : if (atomic_long_read(&sb->s_remove_count))
529 : return -EBUSY;
530 :
531 1 : lock_mount_hash();
532 2 : list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
533 1 : if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
534 1 : mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
535 1 : smp_mb();
536 1 : if (mnt_get_writers(mnt) > 0) {
537 : err = -EBUSY;
538 : break;
539 : }
540 : }
541 : }
542 1 : if (!err && atomic_long_read(&sb->s_remove_count))
543 : err = -EBUSY;
544 :
545 1 : if (!err) {
546 1 : sb->s_readonly_remount = 1;
547 1 : smp_wmb();
548 : }
549 2 : list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
550 1 : if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
551 1 : mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
552 : }
553 1 : unlock_mount_hash();
554 :
555 1 : return err;
556 : }
557 :
558 1069 : static void free_vfsmnt(struct mount *mnt)
559 : {
560 1069 : struct user_namespace *mnt_userns;
561 :
562 1069 : mnt_userns = mnt_user_ns(&mnt->mnt);
563 1069 : if (mnt_userns != &init_user_ns)
564 1069 : put_user_ns(mnt_userns);
565 1069 : kfree_const(mnt->mnt_devname);
566 : #ifdef CONFIG_SMP
567 1069 : free_percpu(mnt->mnt_pcp);
568 : #endif
569 1069 : kmem_cache_free(mnt_cache, mnt);
570 1069 : }
571 :
572 1069 : static void delayed_free_vfsmnt(struct rcu_head *head)
573 : {
574 1069 : free_vfsmnt(container_of(head, struct mount, mnt_rcu));
575 1069 : }
576 :
577 : /* call under rcu_read_lock */
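/*
 * Returns 0 when a reference was successfully grabbed, 1 when the
 * caller should retry its seqlock loop, and -1 when the reference was
 * taken but the mount is going away: the caller must mntput() it
 * (see legitimize_mnt() below).
 */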
578 96429 : int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
579 : {
580 96429 : struct mount *mnt;
581 96429 : if (read_seqretry(&mount_lock, seq))
582 : return 1;
583 96364 : if (bastard == NULL)
584 : return 0;
585 96128 : mnt = real_mount(bastard);
586 96128 : mnt_add_count(mnt, 1);
587 96131 : smp_mb(); // see mntput_no_expire()
588 96139 : if (likely(!read_seqretry(&mount_lock, seq)))
589 : return 0;
590 2 : if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
591 0 : mnt_add_count(mnt, -1);
592 0 : return 1;
593 : }
594 2 : lock_mount_hash();
595 2 : if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
596 0 : mnt_add_count(mnt, -1);
597 0 : unlock_mount_hash();
598 0 : return 1;
599 : }
600 2 : unlock_mount_hash();
601 : /* caller will mntput() */
602 2 : return -1;
603 : }
604 :
605 : /* call under rcu_read_lock */
606 5237 : bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
607 : {
608 5237 : int res = __legitimize_mnt(bastard, seq);
609 5239 : if (likely(!res))
610 : return true;
611 7 : if (unlikely(res < 0)) {
612 0 : rcu_read_unlock();
613 0 : mntput(bastard);
614 0 : rcu_read_lock();
615 : }
616 : return false;
617 : }
618 :
619 : /*
620 : * find the first mount at @dentry on vfsmount @mnt.
621 : * call under rcu_read_lock()
622 : */
623 35304 : struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
624 : {
625 35304 : struct hlist_head *head = m_hash(mnt, dentry);
626 35304 : struct mount *p;
627 :
628 71346 : hlist_for_each_entry_rcu(p, head, mnt_hash)
629 29872 : if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
630 29134 : return p;
631 : return NULL;
632 : }
633 :
634 : /*
635 : * lookup_mnt - Return the first child mount mounted at path
636 : *
637 : * "First" means first mounted chronologically. If you create the
638 : * following mounts:
639 : *
640 : * mount /dev/sda1 /mnt
641 : * mount /dev/sda2 /mnt
642 : * mount /dev/sda3 /mnt
643 : *
644 : * Then lookup_mnt() on the base /mnt dentry in the root mount will
645 : * return successively the root dentry and vfsmount of /dev/sda1, then
646 : * /dev/sda2, then /dev/sda3, then NULL.
647 : *
648 : * lookup_mnt takes a reference to the found vfsmount.
649 : */
650 3091 : struct vfsmount *lookup_mnt(const struct path *path)
651 : {
652 3091 : struct mount *child_mnt;
653 3091 : struct vfsmount *m;
654 3091 : unsigned seq;
655 :
656 3091 : rcu_read_lock();
657 3091 : do {
658 3091 : seq = read_seqbegin(&mount_lock);
659 3091 : child_mnt = __lookup_mnt(path->mnt, path->dentry);
660 3091 : m = child_mnt ? &child_mnt->mnt : NULL;
661 3091 : } while (!legitimize_mnt(m, seq));
662 3091 : rcu_read_unlock();
663 3091 : return m;
664 : }
665 :
666 6769 : static inline void lock_ns_list(struct mnt_namespace *ns)
667 : {
668 13539 : spin_lock(&ns->ns_lock);
669 : }
670 :
671 6770 : static inline void unlock_ns_list(struct mnt_namespace *ns)
672 : {
673 13540 : spin_unlock(&ns->ns_lock);
674 : }
675 :
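/*
 * Cursors are placeholder entries that /proc/mounts readers park in
 * ns->list to remember their position; they must be skipped when
 * iterating over real mounts.
 */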
676 8476 : static inline bool mnt_is_cursor(struct mount *mnt)
677 : {
678 8476 : return mnt->mnt.mnt_flags & MNT_CURSOR;
679 : }
680 :
681 : /*
682 : * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
683 : * current mount namespace.
684 : *
685 : * The common case is that dentries are not mountpoints at all, and that
686 : * test is handled inline. For the slow case when we are actually
687 : * dealing with a mountpoint of some kind, walk through all of the
688 : * mounts in the current mount namespace and test to see if the dentry
689 : * is a mountpoint.
690 : *
691 : * The mount_hashtable is not usable in this context because we
692 : * need to identify all mounts that may be in the current mount
693 : * namespace, not just a mount that happens to have some specified
694 : * parent mount.
695 : */
696 185 : bool __is_local_mountpoint(struct dentry *dentry)
697 : {
698 185 : struct mnt_namespace *ns = current->nsproxy->mnt_ns;
699 185 : struct mount *mnt;
700 185 : bool is_covered = false;
701 :
702 185 : down_read(&namespace_sem);
703 185 : lock_ns_list(ns);
704 3007 : list_for_each_entry(mnt, &ns->list, mnt_list) {
705 3007 : if (mnt_is_cursor(mnt))
706 0 : continue;
707 3007 : is_covered = (mnt->mnt_mountpoint == dentry);
708 3007 : if (is_covered)
709 : break;
710 : }
711 185 : unlock_ns_list(ns);
712 185 : up_read(&namespace_sem);
713 :
714 185 : return is_covered;
715 : }
716 :
717 29 : static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
718 : {
719 29 : struct hlist_head *chain = mp_hash(dentry);
720 29 : struct mountpoint *mp;
721 :
722 58 : hlist_for_each_entry(mp, chain, m_hash) {
723 29 : if (mp->m_dentry == dentry) {
724 29 : mp->m_count++;
725 29 : return mp;
726 : }
727 : }
728 : return NULL;
729 : }
730 :
731 320 : static struct mountpoint *get_mountpoint(struct dentry *dentry)
732 : {
733 320 : struct mountpoint *mp, *new = NULL;
734 320 : int ret;
735 :
736 320 : if (d_mountpoint(dentry)) {
737 : /* might be worth a WARN_ON() */
738 29 : if (d_unlinked(dentry))
739 320 : return ERR_PTR(-ENOENT);
740 29 : mountpoint:
741 29 : read_seqlock_excl(&mount_lock);
742 29 : mp = lookup_mountpoint(dentry);
743 29 : read_sequnlock_excl(&mount_lock);
744 29 : if (mp)
745 29 : goto done;
746 : }
747 :
748 0 : if (!new)
749 291 : new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
750 291 : if (!new)
751 320 : return ERR_PTR(-ENOMEM);
752 :
753 :
754 : /* Exactly one process may set d_mounted */
755 291 : ret = d_set_mounted(dentry);
756 :
757 : /* Someone else set d_mounted? */
758 291 : if (ret == -EBUSY)
759 0 : goto mountpoint;
760 :
761 : /* The dentry is not available as a mountpoint? */
762 291 : mp = ERR_PTR(ret);
763 291 : if (ret)
764 0 : goto done;
765 :
766 : /* Add the new mountpoint to the hash table */
767 291 : read_seqlock_excl(&mount_lock);
768 291 : new->m_dentry = dget(dentry);
769 291 : new->m_count = 1;
770 291 : hlist_add_head(&new->m_hash, mp_hash(dentry));
771 291 : INIT_HLIST_HEAD(&new->m_list);
772 291 : read_sequnlock_excl(&mount_lock);
773 :
774 291 : mp = new;
775 291 : new = NULL;
776 320 : done:
777 320 : kfree(new);
778 320 : return mp;
779 : }
780 :
781 : /*
782 : * vfsmount lock must be held. Additionally, the caller is responsible
783 : * for serializing calls for the given disposal list.
784 : */
785 1347 : static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
786 : {
787 1347 : if (!--mp->m_count) {
788 257 : struct dentry *dentry = mp->m_dentry;
789 257 : BUG_ON(!hlist_empty(&mp->m_list));
790 257 : spin_lock(&dentry->d_lock);
791 257 : dentry->d_flags &= ~DCACHE_MOUNTED;
792 257 : spin_unlock(&dentry->d_lock);
793 257 : dput_to_list(dentry, list);
794 257 : hlist_del(&mp->m_hash);
795 257 : kfree(mp);
796 : }
797 1347 : }
798 :
799 : /* called with namespace_lock and vfsmount lock */
800 1347 : static void put_mountpoint(struct mountpoint *mp)
801 : {
802 1347 : __put_mountpoint(mp, &ex_mountpoints);
803 8 : }
804 :
805 2448 : static inline int check_mnt(struct mount *mnt)
806 : {
807 9 : return mnt->mnt_ns == current->nsproxy->mnt_ns;
808 : }
809 :
810 : /*
811 : * vfsmount lock must be held for write
812 : */
813 429 : static void touch_mnt_namespace(struct mnt_namespace *ns)
814 : {
815 429 : if (ns) {
816 429 : ns->event = ++event;
817 429 : wake_up_interruptible(&ns->poll);
818 : }
819 429 : }
820 :
821 : /*
822 : * vfsmount lock must be held for write
823 : */
824 1064 : static void __touch_mnt_namespace(struct mnt_namespace *ns)
825 : {
826 1064 : if (ns && ns->event != event) {
827 244 : ns->event = event;
828 244 : wake_up_interruptible(&ns->poll);
829 : }
830 1064 : }
831 :
832 : /*
833 : * vfsmount lock must be held for write
834 : */
835 1027 : static struct mountpoint *unhash_mnt(struct mount *mnt)
836 : {
837 1027 : struct mountpoint *mp;
838 1027 : mnt->mnt_parent = mnt;
839 1027 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
840 1027 : list_del_init(&mnt->mnt_child);
841 1027 : hlist_del_init_rcu(&mnt->mnt_hash);
842 1027 : hlist_del_init(&mnt->mnt_mp_list);
843 1027 : mp = mnt->mnt_mp;
844 1027 : mnt->mnt_mp = NULL;
845 1027 : return mp;
846 : }
847 :
848 : /*
849 : * vfsmount lock must be held for write
850 : */
851 1018 : static void umount_mnt(struct mount *mnt)
852 : {
853 1018 : put_mountpoint(unhash_mnt(mnt));
854 1018 : }
855 :
856 : /*
857 : * vfsmount lock must be held for write
858 : */
859 1114 : void mnt_set_mountpoint(struct mount *mnt,
860 : struct mountpoint *mp,
861 : struct mount *child_mnt)
862 : {
863 1114 : mp->m_count++;
864 1114 : mnt_add_count(mnt, 1); /* essentially, that's mntget */
865 1114 : child_mnt->mnt_mountpoint = mp->m_dentry;
866 1114 : child_mnt->mnt_parent = mnt;
867 1114 : child_mnt->mnt_mp = mp;
868 1114 : hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
869 1114 : }
870 :
871 1114 : static void __attach_mnt(struct mount *mnt, struct mount *parent)
872 : {
873 1114 : hlist_add_head_rcu(&mnt->mnt_hash,
874 : m_hash(&parent->mnt, mnt->mnt_mountpoint));
875 1114 : list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
876 1114 : }
877 :
878 : /*
879 : * vfsmount lock must be held for write
880 : */
881 729 : static void attach_mnt(struct mount *mnt,
882 : struct mount *parent,
883 : struct mountpoint *mp)
884 : {
885 729 : mnt_set_mountpoint(parent, mp, mnt);
886 729 : __attach_mnt(mnt, parent);
887 729 : }
888 :
889 0 : void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
890 : {
891 0 : struct mountpoint *old_mp = mnt->mnt_mp;
892 0 : struct mount *old_parent = mnt->mnt_parent;
893 :
894 0 : list_del_init(&mnt->mnt_child);
895 0 : hlist_del_init(&mnt->mnt_mp_list);
896 0 : hlist_del_init_rcu(&mnt->mnt_hash);
897 :
898 0 : attach_mnt(mnt, parent, mp);
899 :
900 0 : put_mountpoint(old_mp);
901 0 : mnt_add_count(old_parent, -1);
902 0 : }
903 :
904 : /*
905 : * vfsmount lock must be held for write
906 : */
907 385 : static void commit_tree(struct mount *mnt)
908 : {
909 385 : struct mount *parent = mnt->mnt_parent;
910 385 : struct mount *m;
911 385 : LIST_HEAD(head);
912 385 : struct mnt_namespace *n = parent->mnt_ns;
913 :
914 385 : BUG_ON(parent == mnt);
915 :
916 385 : list_add_tail(&head, &mnt->mnt_list);
917 806 : list_for_each_entry(m, &head, mnt_list)
918 421 : m->mnt_ns = n;
919 :
920 385 : list_splice(&head, n->list.prev);
921 :
922 385 : n->mounts += n->pending_mounts;
923 385 : n->pending_mounts = 0;
924 :
925 385 : __attach_mnt(mnt, parent);
926 385 : touch_mnt_namespace(n);
927 385 : }
928 :
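/*
 * Depth-first traversal of the mount tree rooted at @root: descend into
 * the first child if there is one, otherwise climb until a next sibling
 * exists. Returns NULL once @root's subtree is exhausted.
 */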
929 3852 : static struct mount *next_mnt(struct mount *p, struct mount *root)
930 : {
931 3852 : struct list_head *next = p->mnt_mounts.next;
932 286 : if (next == &p->mnt_mounts) {
933 3852 : while (1) {
934 3852 : if (p == root)
935 : return NULL;
936 2990 : next = p->mnt_child.next;
937 2990 : if (next != &p->mnt_parent->mnt_mounts)
938 : break;
939 : p = p->mnt_parent;
940 : }
941 : }
942 2990 : return list_entry(next, struct mount, mnt_child);
943 : }
944 :
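/*
 * Return the last mount in @p's subtree, so that a next_mnt() loop can
 * step over the entire subtree in one go.
 */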
945 0 : static struct mount *skip_mnt_tree(struct mount *p)
946 : {
947 0 : struct list_head *prev = p->mnt_mounts.prev;
948 0 : while (prev != &p->mnt_mounts) {
949 0 : p = list_entry(prev, struct mount, mnt_child);
950 0 : prev = p->mnt_mounts.prev;
951 : }
952 0 : return p;
953 : }
954 :
955 : /**
956 : * vfs_create_mount - Create a mount for a configured superblock
957 : * @fc: The configuration context with the superblock attached
958 : *
959 : * Create a mount to an already configured superblock. If necessary, the
960 : * caller should invoke vfs_get_tree() before calling this.
961 : *
962 : * Note that this does not attach the mount to anything.
963 : */
964 127 : struct vfsmount *vfs_create_mount(struct fs_context *fc)
965 : {
966 127 : struct mount *mnt;
967 :
968 127 : if (!fc->root)
969 127 : return ERR_PTR(-EINVAL);
970 :
971 129 : mnt = alloc_vfsmnt(fc->source ?: "none");
972 127 : if (!mnt)
973 127 : return ERR_PTR(-ENOMEM);
974 :
975 127 : if (fc->sb_flags & SB_KERNMOUNT)
976 10 : mnt->mnt.mnt_flags = MNT_INTERNAL;
977 :
978 127 : atomic_inc(&fc->root->d_sb->s_active);
979 127 : mnt->mnt.mnt_sb = fc->root->d_sb;
980 127 : mnt->mnt.mnt_root = dget(fc->root);
981 127 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
982 127 : mnt->mnt_parent = mnt;
983 :
984 127 : lock_mount_hash();
985 127 : list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
986 127 : unlock_mount_hash();
987 127 : return &mnt->mnt;
988 : }
989 : EXPORT_SYMBOL(vfs_create_mount);
990 :
991 12 : struct vfsmount *fc_mount(struct fs_context *fc)
992 : {
993 12 : int err = vfs_get_tree(fc);
994 12 : if (!err) {
995 12 : up_write(&fc->root->d_sb->s_umount);
996 12 : return vfs_create_mount(fc);
997 : }
998 0 : return ERR_PTR(err);
999 : }
1000 : EXPORT_SYMBOL(fc_mount);
1001 :
1002 11 : struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1003 : int flags, const char *name,
1004 : void *data)
1005 : {
1006 11 : struct fs_context *fc;
1007 11 : struct vfsmount *mnt;
1008 11 : int ret = 0;
1009 :
1010 11 : if (!type)
1011 11 : return ERR_PTR(-EINVAL);
1012 :
1013 11 : fc = fs_context_for_mount(type, flags);
1014 11 : if (IS_ERR(fc))
1015 11 : return ERR_CAST(fc);
1016 :
1017 11 : if (name)
1018 11 : ret = vfs_parse_fs_string(fc, "source",
1019 : name, strlen(name));
1020 11 : if (!ret)
1021 11 : ret = parse_monolithic_mount_data(fc, data);
1022 11 : if (!ret)
1023 11 : mnt = fc_mount(fc);
1024 : else
1025 0 : mnt = ERR_PTR(ret);
1026 :
1027 11 : put_fs_context(fc);
1028 11 : return mnt;
1029 : }
1030 : EXPORT_SYMBOL_GPL(vfs_kern_mount);
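
/*
 * Illustrative note: kern_mount(), the usual wrapper for internal
 * filesystems elsewhere in this file, boils down to
 *
 *	vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
 */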
1031 :
1032 : struct vfsmount *
1033 0 : vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1034 : const char *name, void *data)
1035 : {
1036 : /* Until it is worked out how to pass the user namespace
1037 : * through from the parent mount to the submount don't support
1038 : * unprivileged mounts with submounts.
1039 : */
1040 0 : if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1041 0 : return ERR_PTR(-EPERM);
1042 :
1043 0 : return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1044 : }
1045 : EXPORT_SYMBOL_GPL(vfs_submount);
1046 :
1047 1044 : static struct mount *clone_mnt(struct mount *old, struct dentry *root,
1048 : int flag)
1049 : {
1050 1044 : struct super_block *sb = old->mnt.mnt_sb;
1051 1044 : struct mount *mnt;
1052 1044 : int err;
1053 :
1054 1044 : mnt = alloc_vfsmnt(old->mnt_devname);
1055 1044 : if (!mnt)
1056 1044 : return ERR_PTR(-ENOMEM);
1057 :
1058 1044 : if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1059 192 : mnt->mnt_group_id = 0; /* not a peer of original */
1060 : else
1061 852 : mnt->mnt_group_id = old->mnt_group_id;
1062 :
1063 1044 : if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1064 94 : err = mnt_alloc_group_id(mnt);
1065 94 : if (err)
1066 0 : goto out_free;
1067 : }
1068 :
1069 1044 : mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1070 1044 : mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1071 :
1072 1044 : atomic_inc(&sb->s_active);
1073 1044 : mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
1074 1044 : if (mnt->mnt.mnt_userns != &init_user_ns)
1075 0 : mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
1076 1044 : mnt->mnt.mnt_sb = sb;
1077 1044 : mnt->mnt.mnt_root = dget(root);
1078 1044 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1079 1044 : mnt->mnt_parent = mnt;
1080 1044 : lock_mount_hash();
1081 1044 : list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
1082 1044 : unlock_mount_hash();
1083 :
1084 1044 : if ((flag & CL_SLAVE) ||
1085 856 : ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1086 188 : list_add(&mnt->mnt_slave, &old->mnt_slave_list);
1087 188 : mnt->mnt_master = old;
1088 188 : CLEAR_MNT_SHARED(mnt);
1089 856 : } else if (!(flag & CL_PRIVATE)) {
1090 852 : if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
1091 729 : list_add(&mnt->mnt_share, &old->mnt_share);
1092 852 : if (IS_MNT_SLAVE(old))
1093 71 : list_add(&mnt->mnt_slave, &old->mnt_slave);
1094 852 : mnt->mnt_master = old->mnt_master;
1095 : } else {
1096 4 : CLEAR_MNT_SHARED(mnt);
1097 : }
1098 1044 : if (flag & CL_MAKE_SHARED)
1099 140 : set_mnt_shared(mnt);
1100 :
1101 : /* stick the duplicate mount on the same expiry list
1102 : * as the original if that was on one */
1103 1044 : if (flag & CL_EXPIRE) {
1104 733 : if (!list_empty(&old->mnt_expire))
1105 0 : list_add(&mnt->mnt_expire, &old->mnt_expire);
1106 : }
1107 :
1108 : return mnt;
1109 :
1110 0 : out_free:
1111 0 : mnt_free_id(mnt);
1112 0 : free_vfsmnt(mnt);
1113 0 : return ERR_PTR(err);
1114 : }
1115 :
1116 1069 : static void cleanup_mnt(struct mount *mnt)
1117 : {
1118 1069 : struct hlist_node *p;
1119 1069 : struct mount *m;
1120 : /*
1121 : * The warning here probably indicates that somebody messed
1122 : * up a mnt_want/drop_write() pair. If this happens, the
1123 : * filesystem was probably unable to make r/w->r/o transitions.
1124 : * The locking used to deal with mnt_count decrement provides barriers,
1125 : * so mnt_get_writers() below is safe.
1126 : */
1127 1069 : WARN_ON(mnt_get_writers(mnt));
1128 1069 : if (unlikely(mnt->mnt_pins.first))
1129 0 : mnt_pin_kill(mnt);
1130 2138 : hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1131 0 : hlist_del(&m->mnt_umount);
1132 0 : mntput(&m->mnt);
1133 : }
1134 1069 : fsnotify_vfsmount_delete(&mnt->mnt);
1135 1069 : dput(mnt->mnt.mnt_root);
1136 1069 : deactivate_super(mnt->mnt.mnt_sb);
1137 1069 : mnt_free_id(mnt);
1138 1069 : call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1139 1069 : }
1140 :
1141 1069 : static void __cleanup_mnt(struct rcu_head *head)
1142 : {
1143 1069 : cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1144 1069 : }
1145 :
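/*
 * A final mntput() from a kernel thread cannot use task_work, so
 * mntput_no_expire() punts the cleanup onto this list and lets a
 * workqueue run it (see the PF_KTHREAD check below).
 */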
1146 : static LLIST_HEAD(delayed_mntput_list);
1147 0 : static void delayed_mntput(struct work_struct *unused)
1148 : {
1149 0 : struct llist_node *node = llist_del_all(&delayed_mntput_list);
1150 0 : struct mount *m, *t;
1151 :
1152 0 : llist_for_each_entry_safe(m, t, node, mnt_llist)
1153 0 : cleanup_mnt(m);
1154 0 : }
1155 : static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1156 :
1157 152124 : static void mntput_no_expire(struct mount *mnt)
1158 : {
1159 152124 : LIST_HEAD(list);
1160 152124 : int count;
1161 :
1162 152124 : rcu_read_lock();
1163 152159 : if (likely(READ_ONCE(mnt->mnt_ns))) {
1164 : /*
1165 : * Since we don't do lock_mount_hash() here,
1166 : * ->mnt_ns can change under us. However, if it's
1167 : * non-NULL, then there's a reference that won't
1168 : * be dropped until after an RCU delay done after
1169 : * turning ->mnt_ns NULL. So if we observe it
1170 : * non-NULL under rcu_read_lock(), the reference
1171 : * we are dropping is not the final one.
1172 : */
1173 150968 : mnt_add_count(mnt, -1);
1174 150978 : rcu_read_unlock();
1175 303121 : return;
1176 : }
1177 1191 : lock_mount_hash();
1178 : /*
1179 : * make sure that if __legitimize_mnt() has not seen us grab
1180 : * mount_lock, we'll see their refcount increment here.
1181 : */
1182 1191 : smp_mb();
1183 1191 : mnt_add_count(mnt, -1);
1184 1191 : count = mnt_get_count(mnt);
1185 1191 : if (count != 0) {
1186 122 : WARN_ON(count < 0);
1187 122 : rcu_read_unlock();
1188 122 : unlock_mount_hash();
1189 122 : return;
1190 : }
1191 1069 : if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1192 0 : rcu_read_unlock();
1193 0 : unlock_mount_hash();
1194 0 : return;
1195 : }
1196 1069 : mnt->mnt.mnt_flags |= MNT_DOOMED;
1197 1069 : rcu_read_unlock();
1198 :
1199 1069 : list_del(&mnt->mnt_instance);
1200 :
1201 1069 : if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1202 0 : struct mount *p, *tmp;
1203 0 : list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1204 0 : __put_mountpoint(unhash_mnt(p), &list);
1205 0 : hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1206 : }
1207 : }
1208 1069 : unlock_mount_hash();
1209 1069 : shrink_dentry_list(&list);
1210 :
1211 1069 : if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1212 1069 : struct task_struct *task = current;
1213 1069 : if (likely(!(task->flags & PF_KTHREAD))) {
1214 1069 : init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1215 1069 : if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1216 : return;
1217 : }
1218 0 : if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1219 0 : schedule_delayed_work(&delayed_mntput_work, 1);
1220 0 : return;
1221 : }
1222 0 : cleanup_mnt(mnt);
1223 : }
1224 :
1225 210762 : void mntput(struct vfsmount *mnt)
1226 : {
1227 210762 : if (mnt) {
1228 152013 : struct mount *m = real_mount(mnt);
1229 : /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1230 152013 : if (unlikely(m->mnt_expiry_mark))
1231 0 : m->mnt_expiry_mark = 0;
1232 152013 : mntput_no_expire(m);
1233 : }
1234 210768 : }
1235 : EXPORT_SYMBOL(mntput);
1236 :
1237 61141 : struct vfsmount *mntget(struct vfsmount *mnt)
1238 : {
1239 61041 : if (mnt)
1240 61141 : mnt_add_count(real_mount(mnt), 1);
1241 61156 : return mnt;
1242 : }
1243 : EXPORT_SYMBOL(mntget);
1244 :
1245 : /* path_is_mountpoint() - Check if path is a mount in the current
1246 : * namespace.
1247 : *
1248 : * d_mountpoint() can only be used reliably to establish if a dentry is
1249 : * not mounted in any namespace and that common case is handled inline.
1250 : * d_mountpoint() isn't aware of the possibility there may be multiple
1251 : * mounts using a given dentry in a different namespace. This function
1252 : * checks if the passed in path is a mountpoint rather than the dentry
1253 : * alone.
1254 : */
1255 0 : bool path_is_mountpoint(const struct path *path)
1256 : {
1257 0 : unsigned seq;
1258 0 : bool res;
1259 :
1260 0 : if (!d_mountpoint(path->dentry))
1261 : return false;
1262 :
1263 0 : rcu_read_lock();
1264 0 : do {
1265 0 : seq = read_seqbegin(&mount_lock);
1266 0 : res = __path_is_mountpoint(path);
1267 0 : } while (read_seqretry(&mount_lock, seq));
1268 0 : rcu_read_unlock();
1269 :
1270 0 : return res;
1271 : }
1272 : EXPORT_SYMBOL(path_is_mountpoint);
1273 :
1274 0 : struct vfsmount *mnt_clone_internal(const struct path *path)
1275 : {
1276 0 : struct mount *p;
1277 0 : p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1278 0 : if (IS_ERR(p))
1279 0 : return ERR_CAST(p);
1280 0 : p->mnt.mnt_flags |= MNT_INTERNAL;
1281 0 : return &p->mnt;
1282 : }
1283 :
1284 : #ifdef CONFIG_PROC_FS
1285 5640 : static struct mount *mnt_list_next(struct mnt_namespace *ns,
1286 : struct list_head *p)
1287 : {
1288 5640 : struct mount *mnt, *ret = NULL;
1289 :
1290 5640 : lock_ns_list(ns);
1291 5670 : list_for_each_continue(p, &ns->list) {
1292 5469 : mnt = list_entry(p, typeof(*mnt), mnt_list);
1293 5469 : if (!mnt_is_cursor(mnt)) {
1294 : ret = mnt;
1295 : break;
1296 : }
1297 : }
1298 5641 : unlock_ns_list(ns);
1299 :
1300 5641 : return ret;
1301 : }
1302 :
1303 : /* iterator; we want it to have access to namespace_sem, thus here... */
1304 794 : static void *m_start(struct seq_file *m, loff_t *pos)
1305 : {
1306 794 : struct proc_mounts *p = m->private;
1307 794 : struct list_head *prev;
1308 :
1309 794 : down_read(&namespace_sem);
1310 794 : if (!*pos) {
1311 209 : prev = &p->ns->list;
1312 : } else {
1313 585 : prev = &p->cursor.mnt_list;
1314 :
1315 : /* Read after we'd reached the end? */
1316 585 : if (list_empty(prev))
1317 : return NULL;
1318 : }
1319 :
1320 591 : return mnt_list_next(p->ns, prev);
1321 : }
1322 :
1323 5048 : static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1324 : {
1325 5048 : struct proc_mounts *p = m->private;
1326 5048 : struct mount *mnt = v;
1327 :
1328 5048 : ++*pos;
1329 5048 : return mnt_list_next(p->ns, &mnt->mnt_list);
1330 : }
1331 :
1332 794 : static void m_stop(struct seq_file *m, void *v)
1333 : {
1334 794 : struct proc_mounts *p = m->private;
1335 794 : struct mount *mnt = v;
1336 :
1337 794 : lock_ns_list(p->ns);
1338 794 : if (mnt)
1339 390 : list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
1340 : else
1341 404 : list_del_init(&p->cursor.mnt_list);
1342 794 : unlock_ns_list(p->ns);
1343 794 : up_read(&namespace_sem);
1344 794 : }
1345 :
1346 5050 : static int m_show(struct seq_file *m, void *v)
1347 : {
1348 5050 : struct proc_mounts *p = m->private;
1349 5050 : struct mount *r = v;
1350 5050 : return p->show(m, &r->mnt);
1351 : }
1352 :
1353 : const struct seq_operations mounts_op = {
1354 : .start = m_start,
1355 : .next = m_next,
1356 : .stop = m_stop,
1357 : .show = m_show,
1358 : };
1359 :
1360 150 : void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
1361 : {
1362 150 : down_read(&namespace_sem);
1363 150 : lock_ns_list(ns);
1364 150 : list_del(&cursor->mnt_list);
1365 150 : unlock_ns_list(ns);
1366 150 : up_read(&namespace_sem);
1367 150 : }
1368 : #endif /* CONFIG_PROC_FS */
1369 :
1370 : /**
1371 : * may_umount_tree - check if a mount tree is busy
1372 : * @m: root of mount tree
1373 : *
1374 : * This is called to check if a tree of mounts has any
1375 : * open files, pwds, chroots or sub mounts that are
1376 : * busy.
1377 : */
1378 0 : int may_umount_tree(struct vfsmount *m)
1379 : {
1380 0 : struct mount *mnt = real_mount(m);
1381 0 : int actual_refs = 0;
1382 0 : int minimum_refs = 0;
1383 0 : struct mount *p;
1384 0 : BUG_ON(!m);
1385 :
1386 : /* write lock needed for mnt_get_count */
1387 0 : lock_mount_hash();
1388 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
1389 0 : actual_refs += mnt_get_count(p);
1390 0 : minimum_refs += 2;
1391 : }
1392 0 : unlock_mount_hash();
1393 :
1394 0 : if (actual_refs > minimum_refs)
1395 0 : return 0;
1396 :
1397 : return 1;
1398 : }
1399 :
1400 : EXPORT_SYMBOL(may_umount_tree);
1401 :
1402 : /**
1403 : * may_umount - check if a mount point is busy
1404 : * @mnt: root of mount
1405 : *
1406 : * This is called to check if a mount point has any
1407 : * open files, pwds, chroots or sub mounts. If the
1408 : * mount has sub mounts this will return busy
1409 : * regardless of whether the sub mounts are busy.
1410 : *
1411 : * Doesn't take quota and stuff into account. IOW, in some cases it will
1412 : * give false negatives. The main reason why it's here is that we need
1413 : * a non-destructive way to look for easily umountable filesystems.
1414 : */
1415 0 : int may_umount(struct vfsmount *mnt)
1416 : {
1417 0 : int ret = 1;
1418 0 : down_read(&namespace_sem);
1419 0 : lock_mount_hash();
1420 0 : if (propagate_mount_busy(real_mount(mnt), 2))
1421 0 : ret = 0;
1422 0 : unlock_mount_hash();
1423 0 : up_read(&namespace_sem);
1424 0 : return ret;
1425 : }
1426 :
1427 : EXPORT_SYMBOL(may_umount);
1428 :
1429 423 : static void namespace_unlock(void)
1430 : {
1431 423 : struct hlist_head head;
1432 423 : struct hlist_node *p;
1433 423 : struct mount *m;
1434 423 : LIST_HEAD(list);
1435 :
1436 423 : hlist_move_list(&unmounted, &head);
1437 423 : list_splice_init(&ex_mountpoints, &list);
1438 :
1439 423 : up_write(&namespace_sem);
1440 :
1441 423 : shrink_dentry_list(&list);
1442 :
1443 423 : if (likely(hlist_empty(&head)))
1444 270 : return;
1445 :
1446 153 : synchronize_rcu_expedited();
1447 :
1448 1370 : hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1449 1064 : hlist_del(&m->mnt_umount);
1450 1064 : mntput(&m->mnt);
1451 : }
1452 : }
1453 :
1454 423 : static inline void namespace_lock(void)
1455 : {
1456 423 : down_write(&namespace_sem);
1457 : }
1458 :
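/*
 * How umount_tree() should behave (derived from its use below):
 * UMOUNT_SYNC      - synchronous umount: mark victims MNT_SYNC_UMOUNT so
 *                    RCU pathwalk bails out of them
 * UMOUNT_PROPAGATE - propagate the unmount to peers and slaves
 * UMOUNT_CONNECTED - leave unmounted mounts attached to their parent
 *                    (used for lazily detached trees)
 */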
1459 : enum umount_tree_flags {
1460 : UMOUNT_SYNC = 1,
1461 : UMOUNT_PROPAGATE = 2,
1462 : UMOUNT_CONNECTED = 4,
1463 : };
1464 :
1465 1064 : static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1466 : {
1467 : /* Leaving mounts connected is only valid for lazy umounts */
1468 1064 : if (how & UMOUNT_SYNC)
1469 : return true;
1470 :
1471 : /* A mount without a parent has nothing to be connected to */
1472 729 : if (!mnt_has_parent(mnt))
1473 : return true;
1474 :
1475 : /* Because the reference counting rules change when mounts are
1476 : * unmounted and connected, umounted mounts may not be
1477 : * connected to mounted mounts.
1478 : */
1479 682 : if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1480 : return true;
1481 :
1482 : /* Has it been requested that the mount remain connected? */
1483 681 : if (how & UMOUNT_CONNECTED)
1484 : return false;
1485 :
1486 : /* Is the mount locked such that it needs to remain connected? */
1487 681 : if (IS_MNT_LOCKED(mnt))
1488 0 : return false;
1489 :
1490 : /* By default disconnect the mount */
1491 : return true;
1492 : }
1493 :
1494 : /*
1495 : * mount_lock must be held
1496 : * namespace_sem must be held for write
1497 : */
1498 153 : static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1499 : {
1500 153 : LIST_HEAD(tmp_list);
1501 153 : struct mount *p;
1502 :
1503 153 : if (how & UMOUNT_PROPAGATE)
1504 106 : propagate_mount_unlock(mnt);
1505 :
1506 : /* Gather the mounts to umount */
1507 987 : for (p = mnt; p; p = next_mnt(p, mnt)) {
1508 834 : p->mnt.mnt_flags |= MNT_UMOUNT;
1509 834 : list_move(&p->mnt_list, &tmp_list);
1510 : }
1511 :
1512 : /* Hide the mounts from mnt_mounts */
1513 987 : list_for_each_entry(p, &tmp_list, mnt_list) {
1514 834 : list_del_init(&p->mnt_child);
1515 : }
1516 :
1517 : /* Add propagated mounts to the tmp_list */
1518 153 : if (how & UMOUNT_PROPAGATE)
1519 106 : propagate_umount(&tmp_list);
1520 :
1521 1217 : while (!list_empty(&tmp_list)) {
1522 1064 : struct mnt_namespace *ns;
1523 1064 : bool disconnect;
1524 1064 : p = list_first_entry(&tmp_list, struct mount, mnt_list);
1525 1064 : list_del_init(&p->mnt_expire);
1526 1064 : list_del_init(&p->mnt_list);
1527 1064 : ns = p->mnt_ns;
1528 1064 : if (ns) {
1529 1064 : ns->mounts--;
1530 1064 : __touch_mnt_namespace(ns);
1531 : }
1532 1064 : p->mnt_ns = NULL;
1533 1064 : if (how & UMOUNT_SYNC)
1534 335 : p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1535 :
1536 1064 : disconnect = disconnect_mount(p, how);
1537 1064 : if (mnt_has_parent(p)) {
1538 1017 : mnt_add_count(p->mnt_parent, -1);
1539 1017 : if (!disconnect) {
1540 : /* Don't forget about p */
1541 0 : list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1542 : } else {
1543 1017 : umount_mnt(p);
1544 : }
1545 : }
1546 1064 : change_mnt_propagation(p, MS_PRIVATE);
1547 1064 : if (disconnect)
1548 2281 : hlist_add_head(&p->mnt_umount, &unmounted);
1549 : }
1550 153 : }
1551 :
1552 : static void shrink_submounts(struct mount *mnt);
1553 :
1554 0 : static int do_umount_root(struct super_block *sb)
1555 : {
1556 0 : int ret = 0;
1557 :
1558 0 : down_write(&sb->s_umount);
1559 0 : if (!sb_rdonly(sb)) {
1560 0 : struct fs_context *fc;
1561 :
1562 0 : fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1563 : SB_RDONLY);
1564 0 : if (IS_ERR(fc)) {
1565 0 : ret = PTR_ERR(fc);
1566 : } else {
1567 0 : ret = parse_monolithic_mount_data(fc, NULL);
1568 0 : if (!ret)
1569 0 : ret = reconfigure_super(fc);
1570 0 : put_fs_context(fc);
1571 : }
1572 : }
1573 0 : up_write(&sb->s_umount);
1574 0 : return ret;
1575 : }
1576 :
1577 112 : static int do_umount(struct mount *mnt, int flags)
1578 : {
1579 112 : struct super_block *sb = mnt->mnt.mnt_sb;
1580 112 : int retval;
1581 :
1582 112 : retval = security_sb_umount(&mnt->mnt, flags);
1583 112 : if (retval)
1584 : return retval;
1585 :
1586 : /*
1587 : * Allow userspace to request a mountpoint be expired rather than
1588 : * unmounting unconditionally. Unmount only happens if:
1589 : * (1) the mark is already set (the mark is cleared by mntput())
1590 : * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1591 : */
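/*
 * Userspace sketch of the protocol (see umount(2)): a first
 * umount2("/mnt", MNT_EXPIRE) fails with EAGAIN and sets the mark;
 * a second call succeeds only if the mount stayed unused in between.
 */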
1592 112 : if (flags & MNT_EXPIRE) {
1593 0 : if (&mnt->mnt == current->fs->root.mnt ||
1594 0 : flags & (MNT_FORCE | MNT_DETACH))
1595 : return -EINVAL;
1596 :
1597 : /*
1598 : * probably don't strictly need the lock here if we examined
1599 : * all race cases, but it's a slowpath.
1600 : */
1601 0 : lock_mount_hash();
1602 0 : if (mnt_get_count(mnt) != 2) {
1603 0 : unlock_mount_hash();
1604 0 : return -EBUSY;
1605 : }
1606 0 : unlock_mount_hash();
1607 :
1608 0 : if (!xchg(&mnt->mnt_expiry_mark, 1))
1609 : return -EAGAIN;
1610 : }
1611 :
1612 : /*
1613 : * If we may have to abort operations to get out of this
1614 : * mount, and they will themselves hold resources we must
1615 : * allow the fs to do things. In the Unix tradition of
1616 : * 'Gee, that's tricky, let's do it in userspace' the umount_begin
1617 : * might fail to complete on the first run through as other tasks
1618 : * must return, and the like. That's for the mount program to worry
1619 : * about for the moment.
1620 : */
1621 :
1622 112 : if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1623 0 : sb->s_op->umount_begin(sb);
1624 : }
1625 :
1626 : /*
1627 : * No sense to grab the lock for this test, but the test itself looks
1628 : * somewhat bogus. Suggestions for better replacement?
1629 : * Ho-hum... In principle, we might treat that as umount + switch
1630 : * to rootfs. GC would eventually take care of the old vfsmount.
1631 : * Actually it makes sense, especially if rootfs would contain a
1632 : * /reboot - static binary that would close all descriptors and
1633 : * call reboot(2). Then init(8) could umount root and exec /reboot.
1634 : */
1635 112 : if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1636 : /*
1637 : * Special case for "unmounting" root ...
1638 : * we just try to remount it readonly.
1639 : */
1640 0 : if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1641 : return -EPERM;
1642 0 : return do_umount_root(sb);
1643 : }
1644 :
1645 112 : namespace_lock();
1646 112 : lock_mount_hash();
1647 :
1648 : /* Recheck MNT_LOCKED with the locks held */
1649 112 : retval = -EINVAL;
1650 112 : if (mnt->mnt.mnt_flags & MNT_LOCKED)
1651 0 : goto out;
1652 :
1653 112 : event++;
1654 112 : if (flags & MNT_DETACH) {
1655 1 : if (!list_empty(&mnt->mnt_list))
1656 1 : umount_tree(mnt, UMOUNT_PROPAGATE);
1657 : retval = 0;
1658 : } else {
1659 111 : shrink_submounts(mnt);
1660 111 : retval = -EBUSY;
1661 111 : if (!propagate_mount_busy(mnt, 2)) {
1662 105 : if (!list_empty(&mnt->mnt_list))
1663 105 : umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1664 : retval = 0;
1665 : }
1666 : }
1667 6 : out:
1668 112 : unlock_mount_hash();
1669 112 : namespace_unlock();
1670 112 : return retval;
1671 : }
1672 :
1673 : /*
1674 : * __detach_mounts - lazily unmount all mounts on the specified dentry
1675 : *
1676 : * During unlink, rmdir, and d_drop it is possible to lose the path
1677 : * to an existing mountpoint, and wind up leaking the mount.
1678 : * detach_mounts allows lazily unmounting those mounts instead of
1679 : * leaking them.
1680 : *
1681 : * The caller may hold dentry->d_inode->i_mutex.
1682 : */
1683 0 : void __detach_mounts(struct dentry *dentry)
1684 : {
1685 0 : struct mountpoint *mp;
1686 0 : struct mount *mnt;
1687 :
1688 0 : namespace_lock();
1689 0 : lock_mount_hash();
1690 0 : mp = lookup_mountpoint(dentry);
1691 0 : if (!mp)
1692 0 : goto out_unlock;
1693 :
1694 0 : event++;
1695 0 : while (!hlist_empty(&mp->m_list)) {
1696 0 : mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1697 0 : if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1698 0 : umount_mnt(mnt);
1699 0 : hlist_add_head(&mnt->mnt_umount, &unmounted);
1700 : }
1701 0 : else umount_tree(mnt, UMOUNT_CONNECTED);
1702 : }
1703 0 : put_mountpoint(mp);
1704 0 : out_unlock:
1705 0 : unlock_mount_hash();
1706 0 : namespace_unlock();
1707 0 : }
1708 :
1709 : /*
1710 : * Is the caller allowed to modify his namespace?
1711 : */
1712 368 : static inline bool may_mount(void)
1713 : {
1714 368 : return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1715 : }
1716 :
1717 : #ifdef CONFIG_MANDATORY_FILE_LOCKING
1718 : static inline bool may_mandlock(void)
1719 : {
1720 : return capable(CAP_SYS_ADMIN);
1721 : }
1722 : #else
1723 0 : static inline bool may_mandlock(void)
1724 : {
1725 0 : pr_warn("VFS: \"mand\" mount option not supported");
1726 0 : return false;
1727 : }
1728 : #endif
1729 :
1730 114 : static int can_umount(const struct path *path, int flags)
1731 : {
1732 114 : struct mount *mnt = real_mount(path->mnt);
1733 :
1734 114 : if (!may_mount())
1735 : return -EPERM;
1736 114 : if (path->dentry != path->mnt->mnt_root)
1737 : return -EINVAL;
1738 112 : if (!check_mnt(mnt))
1739 : return -EINVAL;
1740 112 : if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1741 : return -EINVAL;
1742 112 : if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1743 0 : return -EPERM;
1744 : return 0;
1745 : }
1746 :
1747 : // caller is responsible for flags being sane
1748 114 : int path_umount(struct path *path, int flags)
1749 : {
1750 114 : struct mount *mnt = real_mount(path->mnt);
1751 114 : int ret;
1752 :
1753 114 : ret = can_umount(path, flags);
1754 114 : if (!ret)
1755 112 : ret = do_umount(mnt, flags);
1756 :
1757 : /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1758 114 : dput(path->dentry);
1759 114 : mntput_no_expire(mnt);
1760 114 : return ret;
1761 : }
1762 :
1763 114 : static int ksys_umount(char __user *name, int flags)
1764 : {
1765 114 : int lookup_flags = LOOKUP_MOUNTPOINT;
1766 114 : struct path path;
1767 114 : int ret;
1768 :
1769 : // basic validity checks done first
1770 114 : if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1771 : return -EINVAL;
1772 :
1773 114 : if (!(flags & UMOUNT_NOFOLLOW))
1774 114 : lookup_flags |= LOOKUP_FOLLOW;
1775 114 : ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1776 114 : if (ret)
1777 : return ret;
1778 114 : return path_umount(&path, flags);
1779 : }
1780 :
1781 228 : SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1782 : {
1783 114 : return ksys_umount(name, flags);
1784 : }
1785 :
1786 : #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1787 :
1788 : /*
1789 : * The 2.0 compatible umount. No flags.
1790 : */
1791 0 : SYSCALL_DEFINE1(oldumount, char __user *, name)
1792 : {
1793 0 : return ksys_umount(name, 0);
1794 : }
1795 :
1796 : #endif
1797 :
1798 1081 : static bool is_mnt_ns_file(struct dentry *dentry)
1799 : {
1800 : /* Is this a proxy for a mount namespace? */
1801 284 : return dentry->d_op == &ns_dentry_operations &&
1802 0 : dentry->d_fsdata == &mntns_operations;
1803 : }
1804 :
1805 6 : static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1806 : {
1807 6 : return container_of(ns, struct mnt_namespace, ns);
1808 : }
1809 :
1810 0 : struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
1811 : {
1812 0 : return &mnt->ns;
1813 : }
1814 :
1815 114 : static bool mnt_ns_loop(struct dentry *dentry)
1816 : {
1817 : /* Could bind mounting the mount namespace inode cause a
1818 : * mount namespace loop?
1819 : */
1820 114 : struct mnt_namespace *mnt_ns;
1821 228 : if (!is_mnt_ns_file(dentry))
1822 : return false;
1823 :
1824 0 : mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1825 0 : return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1826 : }
1827 :
1828 313 : struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1829 : int flag)
1830 : {
1831 313 : struct mount *res, *p, *q, *r, *parent;
1832 :
1833 313 : if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1834 313 : return ERR_PTR(-EINVAL);
1835 :
1836 597 : if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1837 313 : return ERR_PTR(-EINVAL);
1838 :
1839 313 : res = q = clone_mnt(mnt, dentry, flag);
1840 313 : if (IS_ERR(q))
1841 : return q;
1842 :
1843 313 : q->mnt_mountpoint = mnt->mnt_mountpoint;
1844 :
1845 313 : p = mnt;
1846 478 : list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1847 165 : struct mount *s;
1848 165 : if (!is_subdir(r->mnt_mountpoint, dentry))
1849 104 : continue;
1850 :
1851 1499 : for (s = r; s; s = next_mnt(s, r)) {
1852 719 : if (!(flag & CL_COPY_UNBINDABLE) &&
1853 36 : IS_MNT_UNBINDABLE(s)) {
1854 0 : if (s->mnt.mnt_flags & MNT_LOCKED) {
1855 : /* Both unbindable and locked. */
1856 0 : q = ERR_PTR(-EPERM);
1857 0 : goto out;
1858 : } else {
1859 0 : s = skip_mnt_tree(s);
1860 0 : continue;
1861 : }
1862 : }
1863 719 : if (!(flag & CL_COPY_MNT_NS_FILE) &&
1864 1366 : is_mnt_ns_file(s->mnt.mnt_root)) {
1865 0 : s = skip_mnt_tree(s);
1866 0 : continue;
1867 : }
1868 1285 : while (p != s->mnt_parent) {
1869 566 : p = p->mnt_parent;
1870 566 : q = q->mnt_parent;
1871 : }
1872 719 : p = s;
1873 719 : parent = q;
1874 719 : q = clone_mnt(p, p->mnt.mnt_root, flag);
1875 719 : if (IS_ERR(q))
1876 0 : goto out;
1877 719 : lock_mount_hash();
1878 719 : list_add_tail(&q->mnt_list, &res->mnt_list);
1879 719 : attach_mnt(q, parent, p->mnt_mp);
1880 719 : unlock_mount_hash();
1881 : }
1882 : }
1883 : return res;
1884 0 : out:
1885 0 : if (res) {
1886 0 : lock_mount_hash();
1887 0 : umount_tree(res, UMOUNT_SYNC);
1888 0 : unlock_mount_hash();
1889 : }
1890 : return q;
1891 : }
1892 :
1893 : /* Caller should check returned pointer for errors */
1894 :
1895 0 : struct vfsmount *collect_mounts(const struct path *path)
1896 : {
1897 0 : struct mount *tree;
1898 0 : namespace_lock();
1899 0 : if (!check_mnt(real_mount(path->mnt)))
1900 0 : tree = ERR_PTR(-EINVAL);
1901 : else
1902 0 : tree = copy_tree(real_mount(path->mnt), path->dentry,
1903 : CL_COPY_ALL | CL_PRIVATE);
1904 0 : namespace_unlock();
1905 0 : if (IS_ERR(tree))
1906 0 : return ERR_CAST(tree);
1907 0 : return &tree->mnt;
1908 : }
1909 :
1910 : static void free_mnt_ns(struct mnt_namespace *);
1911 : static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
1912 :
1913 0 : void dissolve_on_fput(struct vfsmount *mnt)
1914 : {
1915 0 : struct mnt_namespace *ns;
1916 0 : namespace_lock();
1917 0 : lock_mount_hash();
1918 0 : ns = real_mount(mnt)->mnt_ns;
1919 0 : if (ns) {
1920 0 : if (is_anon_ns(ns))
1921 0 : umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
1922 : else
1923 : ns = NULL;
1924 : }
1925 0 : unlock_mount_hash();
1926 0 : namespace_unlock();
1927 0 : if (ns)
1928 0 : free_mnt_ns(ns);
1929 0 : }
1930 :
1931 47 : void drop_collected_mounts(struct vfsmount *mnt)
1932 : {
1933 47 : namespace_lock();
1934 47 : lock_mount_hash();
1935 47 : umount_tree(real_mount(mnt), 0);
1936 47 : unlock_mount_hash();
1937 47 : namespace_unlock();
1938 47 : }
1939 :
1940 : /**
1941 : * clone_private_mount - create a private clone of a path
1942 : *
1943 : * This creates a new vfsmount, which will be the clone of @path. The new
1944 : * mount will not be attached anywhere in the namespace and will be private
1945 : * (i.e. changes to the originating mount won't be propagated into it).
1946 : *
1947 : * Release with kern_unmount(), as the clone is set up as a longterm mount.
1948 : */
1949 4 : struct vfsmount *clone_private_mount(const struct path *path)
1950 : {
1951 4 : struct mount *old_mnt = real_mount(path->mnt);
1952 4 : struct mount *new_mnt;
1953 :
1954 4 : if (IS_MNT_UNBINDABLE(old_mnt))
1955 4 : return ERR_PTR(-EINVAL);
1956 :
1957 4 : new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1958 4 : if (IS_ERR(new_mnt))
1959 4 : return ERR_CAST(new_mnt);
1960 :
1961 : /* Longterm mount to be removed by kern_unmount*() */
1962 4 : new_mnt->mnt_ns = MNT_NS_INTERNAL;
1963 :
1964 4 : return &new_mnt->mnt;
1965 : }
1966 : EXPORT_SYMBOL_GPL(clone_private_mount);
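/*
 * Sketch (hypothetical caller, not kernel code as-is): how a module might
 * hold a private clone of a user-supplied path. The helper name is
 * invented and error handling is abbreviated; since the clone is marked
 * MNT_NS_INTERNAL above, it must be dropped with kern_unmount(), not a
 * bare mntput().
 */

static struct vfsmount *example_grab_clone(const struct path *path)
{
	struct vfsmount *m = clone_private_mount(path);

	if (IS_ERR(m))
		return m;
	/* ... operate on m->mnt_root / m->mnt_sb ... */
	return m;	/* caller releases with kern_unmount(m) */
}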
1967 :
1968 0 : int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1969 : struct vfsmount *root)
1970 : {
1971 0 : struct mount *mnt;
1972 0 : int res = f(root, arg);
1973 0 : if (res)
1974 : return res;
1975 0 : list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1976 0 : res = f(&mnt->mnt, arg);
1977 0 : if (res)
1978 0 : return res;
1979 : }
1980 : return 0;
1981 : }
1982 :
1983 0 : static void lock_mnt_tree(struct mount *mnt)
1984 : {
1985 0 : struct mount *p;
1986 :
1987 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
1988 0 : int flags = p->mnt.mnt_flags;
1989 : /* Don't allow unprivileged users to change mount flags */
1990 0 : flags |= MNT_LOCK_ATIME;
1991 :
1992 0 : if (flags & MNT_READONLY)
1993 0 : flags |= MNT_LOCK_READONLY;
1994 :
1995 0 : if (flags & MNT_NODEV)
1996 0 : flags |= MNT_LOCK_NODEV;
1997 :
1998 0 : if (flags & MNT_NOSUID)
1999 0 : flags |= MNT_LOCK_NOSUID;
2000 :
2001 0 : if (flags & MNT_NOEXEC)
2002 0 : flags |= MNT_LOCK_NOEXEC;
2003 : /* Don't allow unprivileged users to reveal what is under a mount */
2004 0 : if (list_empty(&p->mnt_expire))
2005 0 : flags |= MNT_LOCKED;
2006 0 : p->mnt.mnt_flags = flags;
2007 : }
2008 0 : }
2009 :
2010 0 : static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2011 : {
2012 0 : struct mount *p;
2013 :
2014 0 : for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2015 0 : if (p->mnt_group_id && !IS_MNT_SHARED(p))
2016 0 : mnt_release_group_id(p);
2017 : }
2018 0 : }
2019 :
2020 53 : static int invent_group_ids(struct mount *mnt, bool recurse)
2021 : {
2022 53 : struct mount *p;
2023 :
2024 303 : for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2025 125 : if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2026 125 : int err = mnt_alloc_group_id(p);
2027 125 : if (err) {
2028 0 : cleanup_group_ids(mnt, p);
2029 0 : return err;
2030 : }
2031 : }
2032 : }
2033 :
2034 : return 0;
2035 : }
2036 :
2037 385 : int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2038 : {
2039 385 : unsigned int max = READ_ONCE(sysctl_mount_max);
2040 385 : unsigned int mounts = 0, old, pending, sum;
2041 385 : struct mount *p;
2042 :
2043 806 : for (p = mnt; p; p = next_mnt(p, mnt))
2044 421 : mounts++;
2045 :
2046 385 : old = ns->mounts;
2047 385 : pending = ns->pending_mounts;
2048 385 : sum = old + pending;
2049 385 : if ((old > sum) ||
2050 385 : (pending > sum) ||
2051 385 : (max < sum) ||
2052 385 : (mounts > (max - sum)))
2053 : return -ENOSPC;
2054 :
2055 385 : ns->pending_mounts = pending + mounts;
2056 385 : return 0;
2057 : }
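/*
 * Illustration (standalone userspace C, not kernel code): why count_mounts()
 * tests old > sum and pending > sum before the limit comparison. In unsigned
 * arithmetic a wrapped sum is smaller than either addend, so those two tests
 * catch overflow that would otherwise slip past the max < sum check.
 */

#include <stdio.h>

int main(void)
{
	unsigned int old = 0xffffff00u, pending = 0x200u;
	unsigned int sum = old + pending;	/* wraps to 0x100 */

	printf("sum=%#x overflowed=%d\n", sum, old > sum);	/* overflowed=1 */
	return 0;
}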
2058 :
2059 : /*
2060 : * @source_mnt : mount tree to be attached
2061 : * @dest_mnt : mount under which @source_mnt is attached
2062 : * @dest_mp : mountpoint dentry of @dest_mnt on which @source_mnt
2063 : * is attached
2064 : * @moving : true if @source_mnt is moved from its current place, false if newly attached
2065 : *
2066 : * NOTE: the table below explains the semantics when a source mount
2067 : * of a given type is attached to a destination mount of a given type.
2068 : * ---------------------------------------------------------------------------
2069 : * | BIND MOUNT OPERATION |
2070 : * |**************************************************************************
2071 : * | source-->| shared | private | slave | unbindable |
2072 : * | dest | | | | |
2073 : * | | | | | | |
2074 : * | v | | | | |
2075 : * |**************************************************************************
2076 : * | shared | shared (++) | shared (+) | shared(+++)| invalid |
2077 : * | | | | | |
2078 : * |non-shared| shared (+) | private | slave (*) | invalid |
2079 : * ***************************************************************************
2080 : * A bind operation clones the source mount and mounts the clone on the
2081 : * destination mount.
2082 : *
2083 : * (++) the cloned mount is propagated to all the mounts in the propagation
2084 : * tree of the destination mount and the cloned mount is added to
2085 : * the peer group of the source mount.
2086 : * (+) the cloned mount is created under the destination mount and is marked
2087 : * as shared. The cloned mount is added to the peer group of the source
2088 : * mount.
2089 : * (+++) the mount is propagated to all the mounts in the propagation tree
2090 : * of the destination mount and the cloned mount is made slave
2091 : * of the same master as that of the source mount. The cloned mount
2092 : * is marked as 'shared and slave'.
2093 : * (*) the cloned mount is made a slave of the same master as that of the
2094 : * source mount.
2095 : *
2096 : * ---------------------------------------------------------------------------
2097 : * | MOVE MOUNT OPERATION |
2098 : * |**************************************************************************
2099 : * | source-->| shared | private | slave | unbindable |
2100 : * | dest | | | | |
2101 : * | | | | | | |
2102 : * | v | | | | |
2103 : * |**************************************************************************
2104 : * | shared | shared (+) | shared (+) | shared(+++) | invalid |
2105 : * | | | | | |
2106 : * |non-shared| shared (+*) | private | slave (*) | unbindable |
2107 : * ***************************************************************************
2108 : *
2109 : * (+) the mount is moved to the destination and is then propagated to
2110 : * all the mounts in the propagation tree of the destination mount.
2111 : * (+*) the mount is moved to the destination.
2112 : * (+++) the mount is moved to the destination and is then propagated to
2113 : * all the mounts belonging to the destination mount's propagation tree.
2114 : * the mount is marked as 'shared and slave'.
2115 : * (*) the mount continues to be a slave at the new location.
2116 : *
2117 : * if the source mount is a tree, the operations explained above is
2118 : * applied to each mount in the tree.
2119 : * Must be called without spinlocks held, since this function can sleep
2120 : * in allocations.
2121 : */
2122 159 : static int attach_recursive_mnt(struct mount *source_mnt,
2123 : struct mount *dest_mnt,
2124 : struct mountpoint *dest_mp,
2125 : bool moving)
2126 : {
2127 159 : struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2128 159 : HLIST_HEAD(tree_list);
2129 159 : struct mnt_namespace *ns = dest_mnt->mnt_ns;
2130 159 : struct mountpoint *smp;
2131 159 : struct mount *child, *p;
2132 159 : struct hlist_node *n;
2133 159 : int err;
2134 :
2135 : /* Preallocate a mountpoint in case the new mounts need
2136 : * to be tucked under other mounts.
2137 : */
2138 159 : smp = get_mountpoint(source_mnt->mnt.mnt_root);
2139 159 : if (IS_ERR(smp))
2140 0 : return PTR_ERR(smp);
2141 :
2142 : /* Is there space to add these mounts to the mount namespace? */
2143 159 : if (!moving) {
2144 151 : err = count_mounts(ns, source_mnt);
2145 151 : if (err)
2146 0 : goto out;
2147 : }
2148 :
2149 159 : if (IS_MNT_SHARED(dest_mnt)) {
2150 49 : err = invent_group_ids(source_mnt, true);
2151 49 : if (err)
2152 0 : goto out;
2153 49 : err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2154 49 : lock_mount_hash();
2155 49 : if (err)
2156 0 : goto out_cleanup_ids;
2157 98 : for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2158 49 : set_mnt_shared(p);
2159 : } else {
2160 110 : lock_mount_hash();
2161 : }
2162 159 : if (moving) {
2163 8 : unhash_mnt(source_mnt);
2164 8 : attach_mnt(source_mnt, dest_mnt, dest_mp);
2165 8 : touch_mnt_namespace(source_mnt->mnt_ns);
2166 : } else {
2167 151 : if (source_mnt->mnt_ns) {
2168 : /* move from anon - the caller will destroy */
2169 0 : list_del_init(&source_mnt->mnt_ns->list);
2170 : }
2171 151 : mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2172 151 : commit_tree(source_mnt);
2173 : }
2174 :
2175 552 : hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2176 234 : struct mount *q;
2177 234 : hlist_del_init(&child->mnt_hash);
2178 234 : q = __lookup_mnt(&child->mnt_parent->mnt,
2179 : child->mnt_mountpoint);
2180 234 : if (q)
2181 0 : mnt_change_mountpoint(child, smp, q);
2182 : /* Notice when we are propagating across user namespaces */
2183 234 : if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2184 0 : lock_mnt_tree(child);
2185 234 : child->mnt.mnt_flags &= ~MNT_LOCKED;
2186 234 : commit_tree(child);
2187 : }
2188 159 : put_mountpoint(smp);
2189 159 : unlock_mount_hash();
2190 :
2191 159 : return 0;
2192 :
2193 0 : out_cleanup_ids:
2194 0 : while (!hlist_empty(&tree_list)) {
2195 0 : child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2196 0 : child->mnt_parent->mnt_ns->pending_mounts = 0;
2197 0 : umount_tree(child, UMOUNT_SYNC);
2198 : }
2199 0 : unlock_mount_hash();
2200 0 : cleanup_group_ids(source_mnt, NULL);
2201 0 : out:
2202 0 : ns->pending_mounts = 0;
2203 :
2204 0 : read_seqlock_excl(&mount_lock);
2205 0 : put_mountpoint(smp);
2206 0 : read_sequnlock_excl(&mount_lock);
2207 :
2208 0 : return err;
2209 : }
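/*
 * Example (userspace, illustrative paths): the "shared" rows of the bind
 * table above, observed from a small program. It assumes /mnt/a is already
 * a mount point, /mnt/b and /mnt/a/sub exist, and the caller may mount.
 */

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Put the destination into a peer group of its own. */
	if (mount(NULL, "/mnt/a", NULL, MS_SHARED, NULL) == -1) {
		perror("MS_SHARED");
		return 1;
	}
	/* Bind it elsewhere: source and clone are now propagation peers. */
	if (mount("/mnt/a", "/mnt/b", NULL, MS_BIND, NULL) == -1) {
		perror("MS_BIND");
		return 1;
	}
	/* Per the rules above, this mount also appears under /mnt/b/sub. */
	if (mount("tmpfs", "/mnt/a/sub", "tmpfs", 0, NULL) == -1) {
		perror("tmpfs");
		return 1;
	}
	return 0;
}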
2210 :
2211 161 : static struct mountpoint *lock_mount(struct path *path)
2212 : {
2213 161 : struct vfsmount *mnt;
2214 161 : struct dentry *dentry = path->dentry;
2215 161 : retry:
2216 161 : inode_lock(dentry->d_inode);
2217 161 : if (unlikely(cant_mount(dentry))) {
2218 0 : inode_unlock(dentry->d_inode);
2219 0 : return ERR_PTR(-ENOENT);
2220 : }
2221 161 : namespace_lock();
2222 161 : mnt = lookup_mnt(path);
2223 161 : if (likely(!mnt)) {
2224 161 : struct mountpoint *mp = get_mountpoint(dentry);
2225 161 : if (IS_ERR(mp)) {
2226 0 : namespace_unlock();
2227 0 : inode_unlock(dentry->d_inode);
2228 0 : return mp;
2229 : }
2230 : return mp;
2231 : }
2232 0 : namespace_unlock();
2233 0 : inode_unlock(path->dentry->d_inode);
2234 0 : path_put(path);
2235 0 : path->mnt = mnt;
2236 0 : dentry = path->dentry = dget(mnt->mnt_root);
2237 0 : goto retry;
2238 : }
2239 :
2240 161 : static void unlock_mount(struct mountpoint *where)
2241 : {
2242 161 : struct dentry *dentry = where->m_dentry;
2243 :
2244 161 : read_seqlock_excl(&mount_lock);
2245 161 : put_mountpoint(where);
2246 161 : read_sequnlock_excl(&mount_lock);
2247 :
2248 161 : namespace_unlock();
2249 161 : inode_unlock(dentry->d_inode);
2250 161 : }
2251 :
2252 151 : static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2253 : {
2254 151 : if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2255 : return -EINVAL;
2256 :
2257 151 : if (d_is_dir(mp->m_dentry) !=
2258 154 : d_is_dir(mnt->mnt.mnt_root))
2259 : return -ENOTDIR;
2260 :
2261 151 : return attach_recursive_mnt(mnt, p, mp, false);
2262 : }
2263 :
2264 : /*
2265 : * Sanity check the flags to change_mnt_propagation.
2266 : */
2267 :
2268 53 : static int flags_to_propagation_type(int ms_flags)
2269 : {
2270 53 : int type = ms_flags & ~(MS_REC | MS_SILENT);
2271 :
2272 : /* Fail if any non-propagation flags are set */
2273 53 : if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2274 : return 0;
2275 : /* Only one propagation flag should be set */
2276 106 : if (!is_power_of_2(type))
2277 0 : return 0;
2278 : return type;
2279 : }
2280 :
2281 : /*
2282 : * recursively change the type of the mountpoint.
2283 : */
2284 53 : static int do_change_type(struct path *path, int ms_flags)
2285 : {
2286 53 : struct mount *m;
2287 53 : struct mount *mnt = real_mount(path->mnt);
2288 53 : int recurse = ms_flags & MS_REC;
2289 53 : int type;
2290 53 : int err = 0;
2291 :
2292 53 : if (path->dentry != path->mnt->mnt_root)
2293 : return -EINVAL;
2294 :
2295 53 : type = flags_to_propagation_type(ms_flags);
2296 53 : if (!type)
2297 : return -EINVAL;
2298 :
2299 53 : namespace_lock();
2300 53 : if (type == MS_SHARED) {
2301 4 : err = invent_group_ids(mnt, recurse);
2302 4 : if (err)
2303 0 : goto out_unlock;
2304 : }
2305 :
2306 53 : lock_mount_hash();
2307 428 : for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2308 161 : change_mnt_propagation(m, type);
2309 53 : unlock_mount_hash();
2310 :
2311 53 : out_unlock:
2312 53 : namespace_unlock();
2313 53 : return err;
2314 : }
2315 :
2316 8 : static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
2317 : {
2318 8 : struct mount *child;
2319 10 : list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2320 2 : if (!is_subdir(child->mnt_mountpoint, dentry))
2321 2 : continue;
2322 :
2323 0 : if (child->mnt.mnt_flags & MNT_LOCKED)
2324 : return true;
2325 : }
2326 : return false;
2327 : }
2328 :
2329 37 : static struct mount *__do_loopback(struct path *old_path, int recurse)
2330 : {
2331 37 : struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2332 :
2333 37 : if (IS_MNT_UNBINDABLE(old))
2334 : return mnt;
2335 :
2336 37 : if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2337 : return mnt;
2338 :
2339 37 : if (!recurse && has_locked_children(old, old_path->dentry))
2340 : return mnt;
2341 :
2342 37 : if (recurse)
2343 29 : mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2344 : else
2345 8 : mnt = clone_mnt(old, old_path->dentry, 0);
2346 :
2347 37 : if (!IS_ERR(mnt))
2348 37 : mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2349 :
2350 : return mnt;
2351 : }
2352 :
2353 : /*
2354 : * do loopback mount.
2355 : */
2356 39 : static int do_loopback(struct path *path, const char *old_name,
2357 : int recurse)
2358 : {
2359 39 : struct path old_path;
2360 39 : struct mount *mnt = NULL, *parent;
2361 39 : struct mountpoint *mp;
2362 39 : int err;
2363 39 : if (!old_name || !*old_name)
2364 : return -EINVAL;
2365 39 : err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2366 39 : if (err)
2367 : return err;
2368 :
2369 37 : err = -EINVAL;
2370 37 : if (mnt_ns_loop(old_path.dentry))
2371 0 : goto out;
2372 :
2373 37 : mp = lock_mount(path);
2374 37 : if (IS_ERR(mp)) {
2375 0 : err = PTR_ERR(mp);
2376 0 : goto out;
2377 : }
2378 :
2379 37 : parent = real_mount(path->mnt);
2380 37 : if (!check_mnt(parent))
2381 0 : goto out2;
2382 :
2383 37 : mnt = __do_loopback(&old_path, recurse);
2384 37 : if (IS_ERR(mnt)) {
2385 0 : err = PTR_ERR(mnt);
2386 0 : goto out2;
2387 : }
2388 :
2389 37 : err = graft_tree(mnt, parent, mp);
2390 37 : if (err) {
2391 0 : lock_mount_hash();
2392 0 : umount_tree(mnt, UMOUNT_SYNC);
2393 0 : unlock_mount_hash();
2394 : }
2395 37 : out2:
2396 37 : unlock_mount(mp);
2397 37 : out:
2398 37 : path_put(&old_path);
2399 37 : return err;
2400 : }
2401 :
2402 0 : static struct file *open_detached_copy(struct path *path, bool recursive)
2403 : {
2404 0 : struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2405 0 : struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
2406 0 : struct mount *mnt, *p;
2407 0 : struct file *file;
2408 :
2409 0 : if (IS_ERR(ns))
2410 0 : return ERR_CAST(ns);
2411 :
2412 0 : namespace_lock();
2413 0 : mnt = __do_loopback(path, recursive);
2414 0 : if (IS_ERR(mnt)) {
2415 0 : namespace_unlock();
2416 0 : free_mnt_ns(ns);
2417 0 : return ERR_CAST(mnt);
2418 : }
2419 :
2420 0 : lock_mount_hash();
2421 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
2422 0 : p->mnt_ns = ns;
2423 0 : ns->mounts++;
2424 : }
2425 0 : ns->root = mnt;
2426 0 : list_add_tail(&ns->list, &mnt->mnt_list);
2427 0 : mntget(&mnt->mnt);
2428 0 : unlock_mount_hash();
2429 0 : namespace_unlock();
2430 :
2431 0 : mntput(path->mnt);
2432 0 : path->mnt = &mnt->mnt;
2433 0 : file = dentry_open(path, O_PATH, current_cred());
2434 0 : if (IS_ERR(file))
2435 0 : dissolve_on_fput(path->mnt);
2436 : else
2437 0 : file->f_mode |= FMODE_NEED_UNMOUNT;
2438 : return file;
2439 : }
2440 :
2441 0 : SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
2442 : {
2443 0 : struct file *file;
2444 0 : struct path path;
2445 0 : int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2446 0 : bool detached = flags & OPEN_TREE_CLONE;
2447 0 : int error;
2448 0 : int fd;
2449 :
2450 0 : BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2451 :
2452 0 : if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2453 : AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2454 : OPEN_TREE_CLOEXEC))
2455 : return -EINVAL;
2456 :
2457 0 : if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2458 : return -EINVAL;
2459 :
2460 0 : if (flags & AT_NO_AUTOMOUNT)
2461 0 : lookup_flags &= ~LOOKUP_AUTOMOUNT;
2462 0 : if (flags & AT_SYMLINK_NOFOLLOW)
2463 0 : lookup_flags &= ~LOOKUP_FOLLOW;
2464 0 : if (flags & AT_EMPTY_PATH)
2465 0 : lookup_flags |= LOOKUP_EMPTY;
2466 :
2467 0 : if (detached && !may_mount())
2468 : return -EPERM;
2469 :
2470 0 : fd = get_unused_fd_flags(flags & O_CLOEXEC);
2471 0 : if (fd < 0)
2472 0 : return fd;
2473 :
2474 0 : error = user_path_at(dfd, filename, lookup_flags, &path);
2475 0 : if (unlikely(error)) {
2476 0 : file = ERR_PTR(error);
2477 : } else {
2478 0 : if (detached)
2479 0 : file = open_detached_copy(&path, flags & AT_RECURSIVE);
2480 : else
2481 0 : file = dentry_open(&path, O_PATH, current_cred());
2482 0 : path_put(&path);
2483 : }
2484 0 : if (IS_ERR(file)) {
2485 0 : put_unused_fd(fd);
2486 0 : return PTR_ERR(file);
2487 : }
2488 0 : fd_install(fd, file);
2489 0 : return fd;
2490 : }
2491 :
2492 : /*
2493 : * Don't allow locked mount flags to be cleared.
2494 : *
2495 : * No locks need to be held here while testing the various MNT_LOCK
2496 : * flags because those flags can never be cleared once they are set.
2497 : */
2498 35 : static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
2499 : {
2500 35 : unsigned int fl = mnt->mnt.mnt_flags;
2501 :
2502 35 : if ((fl & MNT_LOCK_READONLY) &&
2503 0 : !(mnt_flags & MNT_READONLY))
2504 : return false;
2505 :
2506 35 : if ((fl & MNT_LOCK_NODEV) &&
2507 0 : !(mnt_flags & MNT_NODEV))
2508 : return false;
2509 :
2510 35 : if ((fl & MNT_LOCK_NOSUID) &&
2511 0 : !(mnt_flags & MNT_NOSUID))
2512 : return false;
2513 :
2514 35 : if ((fl & MNT_LOCK_NOEXEC) &&
2515 0 : !(mnt_flags & MNT_NOEXEC))
2516 : return false;
2517 :
2518 35 : if ((fl & MNT_LOCK_ATIME) &&
2519 0 : ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
2520 0 : return false;
2521 :
2522 : return true;
2523 : }
2524 :
2525 33 : static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
2526 : {
2527 33 : bool readonly_request = (mnt_flags & MNT_READONLY);
2528 :
2529 64 : if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2530 : return 0;
2531 :
2532 31 : if (readonly_request)
2533 31 : return mnt_make_readonly(mnt);
2534 :
2535 0 : mnt->mnt.mnt_flags &= ~MNT_READONLY;
2536 0 : return 0;
2537 : }
2538 :
2539 35 : static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
2540 : {
2541 35 : mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2542 35 : mnt->mnt.mnt_flags = mnt_flags;
2543 35 : touch_mnt_namespace(mnt->mnt_ns);
2544 33 : }
2545 :
2546 150 : static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
2547 : {
2548 150 : struct super_block *sb = mnt->mnt_sb;
2549 :
2550 266 : if (!__mnt_is_readonly(mnt) &&
2551 115 : (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
2552 0 : char *buf = (char *)__get_free_page(GFP_KERNEL);
2553 0 : char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
2554 0 : struct tm tm;
2555 :
2556 0 : time64_to_tm(sb->s_time_max, 0, &tm);
2557 :
2558 0 : pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
2559 : sb->s_type->name,
2560 : is_mounted(mnt) ? "remounted" : "mounted",
2561 : mntpath,
2562 : tm.tm_year+1900, (unsigned long long)sb->s_time_max);
2563 :
2564 0 : free_page((unsigned long)buf);
2565 : }
2566 150 : }
2567 :
2568 : /*
2569 : * Handle reconfiguration of the mountpoint only without alteration of the
2570 : * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
2571 : * to mount(2).
2572 : */
2573 33 : static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
2574 : {
2575 33 : struct super_block *sb = path->mnt->mnt_sb;
2576 33 : struct mount *mnt = real_mount(path->mnt);
2577 33 : int ret;
2578 :
2579 33 : if (!check_mnt(mnt))
2580 : return -EINVAL;
2581 :
2582 33 : if (path->dentry != mnt->mnt.mnt_root)
2583 : return -EINVAL;
2584 :
2585 33 : if (!can_change_locked_flags(mnt, mnt_flags))
2586 : return -EPERM;
2587 :
2588 : /*
2589 : * We're only checking whether the superblock is read-only not
2590 : * changing it, so only take down_read(&sb->s_umount).
2591 : */
2592 33 : down_read(&sb->s_umount);
2593 33 : lock_mount_hash();
2594 33 : ret = change_mount_ro_state(mnt, mnt_flags);
2595 33 : if (ret == 0)
2596 33 : set_mount_attributes(mnt, mnt_flags);
2597 33 : unlock_mount_hash();
2598 33 : up_read(&sb->s_umount);
2599 :
2600 33 : mnt_warn_timestamp_expiry(path, &mnt->mnt);
2601 :
2602 33 : return ret;
2603 : }
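/*
 * Example (userspace): the MS_REMOUNT|MS_BIND combination that reaches
 * do_reconfigure_mnt() above, changing per-mountpoint flags without
 * touching the superblock. Helper name and target are illustrative.
 */

#include <sys/mount.h>

static int make_mount_readonly(const char *mntpoint)
{
	return mount(NULL, mntpoint, NULL,
		     MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
}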
2604 :
2605 : /*
2606 : * change filesystem flags. dir should be a physical root of filesystem.
2607 : * If you've mounted a non-root directory somewhere and want to do remount
2608 : * on it - tough luck.
2609 : */
2610 2 : static int do_remount(struct path *path, int ms_flags, int sb_flags,
2611 : int mnt_flags, void *data)
2612 : {
2613 2 : int err;
2614 2 : struct super_block *sb = path->mnt->mnt_sb;
2615 2 : struct mount *mnt = real_mount(path->mnt);
2616 2 : struct fs_context *fc;
2617 :
2618 2 : if (!check_mnt(mnt))
2619 : return -EINVAL;
2620 :
2621 2 : if (path->dentry != path->mnt->mnt_root)
2622 : return -EINVAL;
2623 :
2624 2 : if (!can_change_locked_flags(mnt, mnt_flags))
2625 : return -EPERM;
2626 :
2627 2 : fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
2628 2 : if (IS_ERR(fc))
2629 0 : return PTR_ERR(fc);
2630 :
2631 2 : fc->oldapi = true;
2632 2 : err = parse_monolithic_mount_data(fc, data);
2633 2 : if (!err) {
2634 2 : down_write(&sb->s_umount);
2635 2 : err = -EPERM;
2636 2 : if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
2637 2 : err = reconfigure_super(fc);
2638 2 : if (!err) {
2639 2 : lock_mount_hash();
2640 2 : set_mount_attributes(mnt, mnt_flags);
2641 2 : unlock_mount_hash();
2642 : }
2643 : }
2644 2 : up_write(&sb->s_umount);
2645 : }
2646 :
2647 2 : mnt_warn_timestamp_expiry(path, &mnt->mnt);
2648 :
2649 2 : put_fs_context(fc);
2650 2 : return err;
2651 : }
2652 :
2653 0 : static inline int tree_contains_unbindable(struct mount *mnt)
2654 : {
2655 0 : struct mount *p;
2656 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
2657 0 : if (IS_MNT_UNBINDABLE(p))
2658 : return 1;
2659 : }
2660 : return 0;
2661 : }
2662 :
2663 : /*
2664 : * Check that there aren't references to earlier/same mount namespaces in the
2665 : * specified subtree. Such references can act as pins for mount namespaces
2666 : * that aren't checked by the mount-cycle checking code, thereby allowing
2667 : * cycles to be made.
2668 : */
2669 8 : static bool check_for_nsfs_mounts(struct mount *subtree)
2670 : {
2671 8 : struct mount *p;
2672 8 : bool ret = false;
2673 :
2674 8 : lock_mount_hash();
2675 170 : for (p = subtree; p; p = next_mnt(p, subtree))
2676 77 : if (mnt_ns_loop(p->mnt.mnt_root))
2677 0 : goto out;
2678 :
2679 : ret = true;
2680 8 : out:
2681 8 : unlock_mount_hash();
2682 8 : return ret;
2683 : }
2684 :
2685 8 : static int do_move_mount(struct path *old_path, struct path *new_path)
2686 : {
2687 8 : struct mnt_namespace *ns;
2688 8 : struct mount *p;
2689 8 : struct mount *old;
2690 8 : struct mount *parent;
2691 8 : struct mountpoint *mp, *old_mp;
2692 8 : int err;
2693 8 : bool attached;
2694 :
2695 8 : mp = lock_mount(new_path);
2696 8 : if (IS_ERR(mp))
2697 0 : return PTR_ERR(mp);
2698 :
2699 8 : old = real_mount(old_path->mnt);
2700 8 : p = real_mount(new_path->mnt);
2701 8 : parent = old->mnt_parent;
2702 8 : attached = mnt_has_parent(old);
2703 8 : old_mp = old->mnt_mp;
2704 8 : ns = old->mnt_ns;
2705 :
2706 8 : err = -EINVAL;
2707 : /* The mountpoint must be in our namespace. */
2708 8 : if (!check_mnt(p))
2709 0 : goto out;
2710 :
2711 : /* The thing moved must be mounted... */
2712 16 : if (!is_mounted(&old->mnt))
2713 0 : goto out;
2714 :
2715 : /* ... and either ours or the root of anon namespace */
2716 8 : if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
2717 0 : goto out;
2718 :
2719 8 : if (old->mnt.mnt_flags & MNT_LOCKED)
2720 0 : goto out;
2721 :
2722 8 : if (old_path->dentry != old_path->mnt->mnt_root)
2723 0 : goto out;
2724 :
2725 8 : if (d_is_dir(new_path->dentry) !=
2726 8 : d_is_dir(old_path->dentry))
2727 0 : goto out;
2728 : /*
2729 : * Don't move a mount residing in a shared parent.
2730 : */
2731 8 : if (attached && IS_MNT_SHARED(parent))
2732 0 : goto out;
2733 : /*
2734 : * Don't move a mount tree containing unbindable mounts to a destination
2735 : * mount which is shared.
2736 : */
2737 8 : if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2738 0 : goto out;
2739 8 : err = -ELOOP;
2740 8 : if (!check_for_nsfs_mounts(old))
2741 0 : goto out;
2742 21 : for (; mnt_has_parent(p); p = p->mnt_parent)
2743 13 : if (p == old)
2744 0 : goto out;
2745 :
2746 8 : err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
2747 : attached);
2748 8 : if (err)
2749 0 : goto out;
2750 :
2751 : /* if the mount is moved, it should no longer expire
2752 : * automatically */
2753 8 : list_del_init(&old->mnt_expire);
2754 8 : if (attached)
2755 8 : put_mountpoint(old_mp);
2756 0 : out:
2757 8 : unlock_mount(mp);
2758 8 : if (!err) {
2759 8 : if (attached)
2760 8 : mntput_no_expire(parent);
2761 : else
2762 0 : free_mnt_ns(ns);
2763 : }
2764 : return err;
2765 : }
2766 :
2767 6 : static int do_move_mount_old(struct path *path, const char *old_name)
2768 : {
2769 6 : struct path old_path;
2770 6 : int err;
2771 :
2772 6 : if (!old_name || !*old_name)
2773 : return -EINVAL;
2774 :
2775 6 : err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2776 6 : if (err)
2777 : return err;
2778 :
2779 6 : err = do_move_mount(&old_path, path);
2780 6 : path_put(&old_path);
2781 6 : return err;
2782 : }
2783 :
2784 : /*
2785 : * add a mount into a namespace's mount tree
2786 : */
2787 115 : static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
2788 : struct path *path, int mnt_flags)
2789 : {
2790 115 : struct mount *parent = real_mount(path->mnt);
2791 :
2792 115 : mnt_flags &= ~MNT_INTERNAL_FLAGS;
2793 :
2794 115 : if (unlikely(!check_mnt(parent))) {
2795 : /* that's acceptable only for automounts done in private ns */
2796 0 : if (!(mnt_flags & MNT_SHRINKABLE))
2797 : return -EINVAL;
2798 : /* ... and for those we'd better have mountpoint still alive */
2799 0 : if (!parent->mnt_ns)
2800 : return -EINVAL;
2801 : }
2802 :
2803 : /* Refuse the same filesystem on the same mount point */
2804 115 : if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2805 1 : path->mnt->mnt_root == path->dentry)
2806 : return -EBUSY;
2807 :
2808 114 : if (d_is_symlink(newmnt->mnt.mnt_root))
2809 : return -EINVAL;
2810 :
2811 114 : newmnt->mnt.mnt_flags = mnt_flags;
2812 114 : return graft_tree(newmnt, parent, mp);
2813 : }
2814 :
2815 : static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
2816 :
2817 : /*
2818 : * Create a new mount using a superblock configuration and request it
2819 : * be added to the namespace tree.
2820 : */
2821 115 : static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
2822 : unsigned int mnt_flags)
2823 : {
2824 115 : struct vfsmount *mnt;
2825 115 : struct mountpoint *mp;
2826 115 : struct super_block *sb = fc->root->d_sb;
2827 115 : int error;
2828 :
2829 115 : error = security_sb_kern_mount(sb);
2830 115 : if (!error && mount_too_revealing(sb, &mnt_flags))
2831 0 : error = -EPERM;
2832 :
2833 115 : if (unlikely(error)) {
2834 0 : fc_drop_locked(fc);
2835 0 : return error;
2836 : }
2837 :
2838 115 : up_write(&sb->s_umount);
2839 :
2840 115 : mnt = vfs_create_mount(fc);
2841 115 : if (IS_ERR(mnt))
2842 0 : return PTR_ERR(mnt);
2843 :
2844 115 : mnt_warn_timestamp_expiry(mountpoint, mnt);
2845 :
2846 115 : mp = lock_mount(mountpoint);
2847 115 : if (IS_ERR(mp)) {
2848 0 : mntput(mnt);
2849 0 : return PTR_ERR(mp);
2850 : }
2851 115 : error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
2852 115 : unlock_mount(mp);
2853 115 : if (error < 0)
2854 1 : mntput(mnt);
2855 : return error;
2856 : }
2857 :
2858 : /*
2859 : * create a new mount for userspace and request it to be added into the
2860 : * namespace's tree
2861 : */
2862 116 : static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
2863 : int mnt_flags, const char *name, void *data)
2864 : {
2865 116 : struct file_system_type *type;
2866 116 : struct fs_context *fc;
2867 116 : const char *subtype = NULL;
2868 116 : int err = 0;
2869 :
2870 116 : if (!fstype)
2871 : return -EINVAL;
2872 :
2873 116 : type = get_fs_type(fstype);
2874 116 : if (!type)
2875 : return -ENODEV;
2876 :
2877 116 : if (type->fs_flags & FS_HAS_SUBTYPE) {
2878 0 : subtype = strchr(fstype, '.');
2879 0 : if (subtype) {
2880 0 : subtype++;
2881 0 : if (!*subtype) {
2882 0 : put_filesystem(type);
2883 0 : return -EINVAL;
2884 : }
2885 : }
2886 : }
2887 :
2888 116 : fc = fs_context_for_mount(type, sb_flags);
2889 116 : put_filesystem(type);
2890 116 : if (IS_ERR(fc))
2891 0 : return PTR_ERR(fc);
2892 :
2893 116 : if (subtype)
2894 0 : err = vfs_parse_fs_string(fc, "subtype",
2895 : subtype, strlen(subtype));
2896 116 : if (!err && name)
2897 116 : err = vfs_parse_fs_string(fc, "source", name, strlen(name));
2898 116 : if (!err)
2899 116 : err = parse_monolithic_mount_data(fc, data);
2900 116 : if (!err && !mount_capable(fc))
2901 : err = -EPERM;
2902 116 : if (!err)
2903 116 : err = vfs_get_tree(fc);
2904 116 : if (!err)
2905 115 : err = do_new_mount_fc(fc, path, mnt_flags);
2906 :
2907 116 : put_fs_context(fc);
2908 116 : return err;
2909 : }
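/*
 * Example (userspace): a plain new-filesystem mount, the request that ends
 * in do_new_mount() above. Target directory and options are illustrative.
 */

#include <sys/mount.h>

static int mount_scratch_tmpfs(void)
{
	return mount("tmpfs", "/mnt/scratch", "tmpfs",
		     MS_NOSUID | MS_NODEV, "size=16m");
}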
2910 :
2911 0 : int finish_automount(struct vfsmount *m, struct path *path)
2912 : {
2913 0 : struct dentry *dentry = path->dentry;
2914 0 : struct mountpoint *mp;
2915 0 : struct mount *mnt;
2916 0 : int err;
2917 :
2918 0 : if (!m)
2919 : return 0;
2920 0 : if (IS_ERR(m))
2921 0 : return PTR_ERR(m);
2922 :
2923 0 : mnt = real_mount(m);
2924 : /* The new mount record should have at least 2 refs to prevent it being
2925 : * expired before we get a chance to add it
2926 : */
2927 0 : BUG_ON(mnt_get_count(mnt) < 2);
2928 :
2929 0 : if (m->mnt_sb == path->mnt->mnt_sb &&
2930 0 : m->mnt_root == dentry) {
2931 0 : err = -ELOOP;
2932 0 : goto discard;
2933 : }
2934 :
2935 : /*
2936 : * we don't want to use lock_mount() - in this case finding something
2937 : * that overmounts our mountpoint means "quietly drop what we've
2938 : * got", not "try to mount it on top".
2939 : */
2940 0 : inode_lock(dentry->d_inode);
2941 0 : namespace_lock();
2942 0 : if (unlikely(cant_mount(dentry))) {
2943 0 : err = -ENOENT;
2944 0 : goto discard_locked;
2945 : }
2946 0 : rcu_read_lock();
2947 0 : if (unlikely(__lookup_mnt(path->mnt, dentry))) {
2948 0 : rcu_read_unlock();
2949 0 : err = 0;
2950 0 : goto discard_locked;
2951 : }
2952 0 : rcu_read_unlock();
2953 0 : mp = get_mountpoint(dentry);
2954 0 : if (IS_ERR(mp)) {
2955 0 : err = PTR_ERR(mp);
2956 0 : goto discard_locked;
2957 : }
2958 :
2959 0 : err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
2960 0 : unlock_mount(mp);
2961 0 : if (unlikely(err))
2962 0 : goto discard;
2963 0 : mntput(m);
2964 0 : return 0;
2965 :
2966 0 : discard_locked:
2967 0 : namespace_unlock();
2968 0 : inode_unlock(dentry->d_inode);
2969 0 : discard:
2970 : /* remove m from any expiration list it may be on */
2971 0 : if (!list_empty(&mnt->mnt_expire)) {
2972 0 : namespace_lock();
2973 0 : list_del_init(&mnt->mnt_expire);
2974 0 : namespace_unlock();
2975 : }
2976 0 : mntput(m);
2977 0 : mntput(m);
2978 0 : return err;
2979 : }
2980 :
2981 : /**
2982 : * mnt_set_expiry - Put a mount on an expiration list
2983 : * @mnt: The mount to list.
2984 : * @expiry_list: The list to add the mount to.
2985 : */
2986 0 : void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
2987 : {
2988 0 : namespace_lock();
2989 :
2990 0 : list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
2991 :
2992 0 : namespace_unlock();
2993 0 : }
2994 : EXPORT_SYMBOL(mnt_set_expiry);
2995 :
2996 : /*
2997 : * process a list of expirable mountpoints with the intent of discarding any
2998 : * mountpoints that aren't in use and haven't been touched since the last
2999 : * time we came here
3000 : */
3001 0 : void mark_mounts_for_expiry(struct list_head *mounts)
3002 : {
3003 0 : struct mount *mnt, *next;
3004 0 : LIST_HEAD(graveyard);
3005 :
3006 0 : if (list_empty(mounts))
3007 0 : return;
3008 :
3009 0 : namespace_lock();
3010 0 : lock_mount_hash();
3011 :
3012 : /* extract from the expiration list every vfsmount that matches the
3013 : * following criteria:
3014 : * - only referenced by its parent vfsmount
3015 : * - still marked for expiry (marked on the last call here; marks are
3016 : * cleared by mntput())
3017 : */
3018 0 : list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3019 0 : if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3020 0 : propagate_mount_busy(mnt, 1))
3021 0 : continue;
3022 0 : list_move(&mnt->mnt_expire, &graveyard);
3023 : }
3024 0 : while (!list_empty(&graveyard)) {
3025 0 : mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3026 0 : touch_mnt_namespace(mnt->mnt_ns);
3027 0 : umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3028 : }
3029 0 : unlock_mount_hash();
3030 0 : namespace_unlock();
3031 : }
3032 :
3033 : EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
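/*
 * Sketch (hypothetical, not kernel code as-is): how an automounting
 * filesystem might drive the expiry machinery. Each automount is placed on
 * a private list with mnt_set_expiry(); a periodic worker then reaps
 * anything unused since its previous pass. All example_* names are invented.
 */

static LIST_HEAD(example_automount_list);

static void example_expiry_worker(struct work_struct *work)
{
	/* First pass marks; a mount still marked next time is unmounted. */
	mark_mounts_for_expiry(&example_automount_list);
	/* ... re-arm the delayed work here ... */
}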
3034 :
3035 : /*
3036 : * Ripoff of 'select_parent()'
3037 : *
3038 : * search the list of submounts for a given mountpoint, and move any
3039 : * shrinkable submounts to the 'graveyard' list.
3040 : */
3041 111 : static int select_submounts(struct mount *parent, struct list_head *graveyard)
3042 : {
3043 111 : struct mount *this_parent = parent;
3044 111 : struct list_head *next;
3045 111 : int found = 0;
3046 :
3047 111 : repeat:
3048 111 : next = this_parent->mnt_mounts.next;
3049 : resume:
3050 123 : while (next != &this_parent->mnt_mounts) {
3051 12 : struct list_head *tmp = next;
3052 12 : struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
3053 :
3054 12 : next = tmp->next;
3055 12 : if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3056 12 : continue;
3057 : /*
3058 : * Descend a level if the mnt_mounts list is non-empty.
3059 : */
3060 0 : if (!list_empty(&mnt->mnt_mounts)) {
3061 0 : this_parent = mnt;
3062 0 : goto repeat;
3063 : }
3064 :
3065 0 : if (!propagate_mount_busy(mnt, 1)) {
3066 0 : list_move_tail(&mnt->mnt_expire, graveyard);
3067 0 : found++;
3068 : }
3069 : }
3070 : /*
3071 : * All done at this level ... ascend and resume the search
3072 : */
3073 111 : if (this_parent != parent) {
3074 0 : next = this_parent->mnt_child.next;
3075 0 : this_parent = this_parent->mnt_parent;
3076 0 : goto resume;
3077 : }
3078 111 : return found;
3079 : }
3080 :
3081 : /*
3082 : * process a list of expirable mountpoints with the intent of discarding any
3083 : * submounts of a specific parent mountpoint
3084 : *
3085 : * mount_lock must be held for write
3086 : */
3087 111 : static void shrink_submounts(struct mount *mnt)
3088 : {
3089 111 : LIST_HEAD(graveyard);
3090 111 : struct mount *m;
3091 :
3092 : /* extract submounts of 'mountpoint' from the expiration list */
3093 111 : while (select_submounts(mnt, &graveyard)) {
3094 0 : while (!list_empty(&graveyard)) {
3095 0 : m = list_first_entry(&graveyard, struct mount,
3096 : mnt_expire);
3097 0 : touch_mnt_namespace(m->mnt_ns);
3098 0 : umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3099 : }
3100 : }
3101 111 : }
3102 :
3103 262 : static void *copy_mount_options(const void __user * data)
3104 : {
3105 262 : char *copy;
3106 262 : unsigned left, offset;
3107 :
3108 262 : if (!data)
3109 : return NULL;
3110 :
3111 109 : copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3112 109 : if (!copy)
3113 262 : return ERR_PTR(-ENOMEM);
3114 :
3115 109 : left = copy_from_user(copy, data, PAGE_SIZE);
3116 :
3117 : /*
3118 : * Not all architectures have an exact copy_from_user(). Resort to
3119 : * copying a byte at a time.
3120 : */
3121 109 : offset = PAGE_SIZE - left;
3122 109 : while (left) {
3123 0 : char c;
3124 0 : if (get_user(c, (const char __user *)data + offset))
3125 : break;
3126 0 : copy[offset] = c;
3127 0 : left--;
3128 0 : offset++;
3129 : }
3130 :
3131 109 : if (left == PAGE_SIZE) {
3132 0 : kfree(copy);
3133 0 : return ERR_PTR(-EFAULT);
3134 : }
3135 :
3136 : return copy;
3137 : }
3138 :
3139 524 : static char *copy_mount_string(const void __user *data)
3140 : {
3141 295 : return data ? strndup_user(data, PATH_MAX) : NULL;
3142 : }
3143 :
3144 : /*
3145 : * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3146 : * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3147 : *
3148 : * data is a (void *) that can point to any structure up to
3149 : * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3150 : * information (or be NULL).
3151 : *
3152 : * Pre-0.97 versions of mount() didn't have a flags word.
3153 : * When the flags word was introduced its top half was required
3154 : * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3155 : * Therefore, if this magic number is present, it carries no information
3156 : * and must be discarded.
3157 : */
3158 250 : int path_mount(const char *dev_name, struct path *path,
3159 : const char *type_page, unsigned long flags, void *data_page)
3160 : {
3161 250 : unsigned int mnt_flags = 0, sb_flags;
3162 250 : int ret;
3163 :
3164 : /* Discard magic */
3165 250 : if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3166 0 : flags &= ~MS_MGC_MSK;
3167 :
3168 : /* Basic sanity checks */
3169 250 : if (data_page)
3170 108 : ((char *)data_page)[PAGE_SIZE - 1] = 0;
3171 :
3172 250 : if (flags & MS_NOUSER)
3173 : return -EINVAL;
3174 :
3175 250 : ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
3176 250 : if (ret)
3177 : return ret;
3178 249 : if (!may_mount())
3179 : return -EPERM;
3180 249 : if ((flags & SB_MANDLOCK) && !may_mandlock())
3181 0 : return -EPERM;
3182 :
3183 : /* Default to relatime unless overridden */
3184 249 : if (!(flags & MS_NOATIME))
3185 249 : mnt_flags |= MNT_RELATIME;
3186 :
3187 : /* Separate the per-mountpoint flags */
3188 249 : if (flags & MS_NOSUID)
3189 37 : mnt_flags |= MNT_NOSUID;
3190 249 : if (flags & MS_NODEV)
3191 32 : mnt_flags |= MNT_NODEV;
3192 249 : if (flags & MS_NOEXEC)
3193 27 : mnt_flags |= MNT_NOEXEC;
3194 249 : if (flags & MS_NOATIME)
3195 0 : mnt_flags |= MNT_NOATIME;
3196 249 : if (flags & MS_NODIRATIME)
3197 0 : mnt_flags |= MNT_NODIRATIME;
3198 249 : if (flags & MS_STRICTATIME)
3199 8 : mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3200 249 : if (flags & MS_RDONLY)
3201 36 : mnt_flags |= MNT_READONLY;
3202 249 : if (flags & MS_NOSYMFOLLOW)
3203 0 : mnt_flags |= MNT_NOSYMFOLLOW;
3204 :
3205 : /* The default atime for remount is preservation */
3206 249 : if ((flags & MS_REMOUNT) &&
3207 : ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
3208 : MS_STRICTATIME)) == 0)) {
3209 32 : mnt_flags &= ~MNT_ATIME_MASK;
3210 32 : mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
3211 : }
3212 :
3213 249 : sb_flags = flags & (SB_RDONLY |
3214 : SB_SYNCHRONOUS |
3215 : SB_MANDLOCK |
3216 : SB_DIRSYNC |
3217 : SB_SILENT |
3218 : SB_POSIXACL |
3219 : SB_LAZYTIME |
3220 : SB_I_VERSION);
3221 :
3222 249 : if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
3223 33 : return do_reconfigure_mnt(path, mnt_flags);
3224 216 : if (flags & MS_REMOUNT)
3225 2 : return do_remount(path, flags, sb_flags, mnt_flags, data_page);
3226 214 : if (flags & MS_BIND)
3227 39 : return do_loopback(path, dev_name, flags & MS_REC);
3228 175 : if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3229 53 : return do_change_type(path, flags);
3230 122 : if (flags & MS_MOVE)
3231 6 : return do_move_mount_old(path, dev_name);
3232 :
3233 116 : return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
3234 : data_page);
3235 : }
3236 :
3237 262 : long do_mount(const char *dev_name, const char __user *dir_name,
3238 : const char *type_page, unsigned long flags, void *data_page)
3239 : {
3240 262 : struct path path;
3241 262 : int ret;
3242 :
3243 262 : ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
3244 262 : if (ret)
3245 17 : return ret;
3246 245 : ret = path_mount(dev_name, &path, type_page, flags, data_page);
3247 245 : path_put(&path);
3248 245 : return ret;
3249 : }
3250 :
3251 51 : static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
3252 : {
3253 51 : return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
3254 : }
3255 :
3256 47 : static void dec_mnt_namespaces(struct ucounts *ucounts)
3257 : {
3258 47 : dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
3259 : }
3260 :
3261 47 : static void free_mnt_ns(struct mnt_namespace *ns)
3262 : {
3263 47 : if (!is_anon_ns(ns))
3264 47 : ns_free_inum(&ns->ns);
3265 47 : dec_mnt_namespaces(ns->ucounts);
3266 47 : put_user_ns(ns->user_ns);
3267 47 : kfree(ns);
3268 47 : }
3269 :
3270 : /*
3271 : * Assign a sequence number so we can detect when we attempt to bind
3272 : * mount a reference to an older mount namespace into the current
3273 : * mount namespace, preventing reference counting loops. A 64bit
3274 : * counter incremented once per nanosecond would take over 584 years to
3275 : * wrap; namespaces are created far more slowly, so we can ignore the possibility.
3276 : */
3277 : static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
3278 :
3279 51 : static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
3280 : {
3281 51 : struct mnt_namespace *new_ns;
3282 51 : struct ucounts *ucounts;
3283 51 : int ret;
3284 :
3285 51 : ucounts = inc_mnt_namespaces(user_ns);
3286 51 : if (!ucounts)
3287 51 : return ERR_PTR(-ENOSPC);
3288 :
3289 51 : new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
3290 51 : if (!new_ns) {
3291 0 : dec_mnt_namespaces(ucounts);
3292 0 : return ERR_PTR(-ENOMEM);
3293 : }
3294 51 : if (!anon) {
3295 51 : ret = ns_alloc_inum(&new_ns->ns);
3296 51 : if (ret) {
3297 0 : kfree(new_ns);
3298 0 : dec_mnt_namespaces(ucounts);
3299 0 : return ERR_PTR(ret);
3300 : }
3301 : }
3302 51 : new_ns->ns.ops = &mntns_operations;
3303 51 : if (!anon)
3304 102 : new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
3305 51 : refcount_set(&new_ns->ns.count, 1);
3306 51 : INIT_LIST_HEAD(&new_ns->list);
3307 51 : init_waitqueue_head(&new_ns->poll);
3308 51 : spin_lock_init(&new_ns->ns_lock);
3309 51 : new_ns->user_ns = get_user_ns(user_ns);
3310 51 : new_ns->ucounts = ucounts;
3311 51 : return new_ns;
3312 : }
3313 :
3314 : __latent_entropy
3315 50 : struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
3316 : struct user_namespace *user_ns, struct fs_struct *new_fs)
3317 : {
3318 50 : struct mnt_namespace *new_ns;
3319 50 : struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
3320 50 : struct mount *p, *q;
3321 50 : struct mount *old;
3322 50 : struct mount *new;
3323 50 : int copy_flags;
3324 :
3325 50 : BUG_ON(!ns);
3326 :
3327 50 : if (likely(!(flags & CLONE_NEWNS))) {
3328 0 : get_mnt_ns(ns);
3329 0 : return ns;
3330 : }
3331 :
3332 50 : old = ns->root;
3333 :
3334 50 : new_ns = alloc_mnt_ns(user_ns, false);
3335 50 : if (IS_ERR(new_ns))
3336 : return new_ns;
3337 :
3338 50 : namespace_lock();
3339 : /* First pass: copy the tree topology */
3340 50 : copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
3341 50 : if (user_ns != ns->user_ns)
3342 0 : copy_flags |= CL_SHARED_TO_SLAVE;
3343 50 : new = copy_tree(old, old->mnt.mnt_root, copy_flags);
3344 50 : if (IS_ERR(new)) {
3345 0 : namespace_unlock();
3346 0 : free_mnt_ns(new_ns);
3347 0 : return ERR_CAST(new);
3348 : }
3349 50 : if (user_ns != ns->user_ns) {
3350 0 : lock_mount_hash();
3351 0 : lock_mnt_tree(new);
3352 0 : unlock_mount_hash();
3353 : }
3354 50 : new_ns->root = new;
3355 50 : list_add_tail(&new_ns->list, &new->mnt_list);
3356 :
3357 : /*
3358 : * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
3359 : * as belonging to new namespace. We have already acquired a private
3360 : * fs_struct, so tsk->fs->lock is not needed.
3361 : */
3362 50 : p = old;
3363 50 : q = new;
3364 733 : while (p) {
3365 733 : q->mnt_ns = new_ns;
3366 733 : new_ns->mounts++;
3367 733 : if (new_fs) {
3368 733 : if (&p->mnt == new_fs->root.mnt) {
3369 50 : new_fs->root.mnt = mntget(&q->mnt);
3370 50 : rootmnt = &p->mnt;
3371 : }
3372 733 : if (&p->mnt == new_fs->pwd.mnt) {
3373 50 : new_fs->pwd.mnt = mntget(&q->mnt);
3374 50 : pwdmnt = &p->mnt;
3375 : }
3376 : }
3377 733 : p = next_mnt(p, old);
3378 733 : q = next_mnt(q, new);
3379 733 : if (!q)
3380 : break;
3381 683 : while (p->mnt.mnt_root != q->mnt.mnt_root)
3382 683 : p = next_mnt(p, old);
3383 : }
3384 50 : namespace_unlock();
3385 :
3386 50 : if (rootmnt)
3387 50 : mntput(rootmnt);
3388 50 : if (pwdmnt)
3389 50 : mntput(pwdmnt);
3390 :
3391 : return new_ns;
3392 : }
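/*
 * Example (userspace): the common way into copy_mnt_ns() above, via
 * unshare(2) with CLONE_NEWNS; making the copy private afterwards stops
 * later mounts from propagating back to the parent namespace. Helper name
 * is illustrative.
 */

#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>

static int enter_private_mount_ns(void)
{
	if (unshare(CLONE_NEWNS) == -1)
		return -1;
	return mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
}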
3393 :
3394 0 : struct dentry *mount_subtree(struct vfsmount *m, const char *name)
3395 : {
3396 0 : struct mount *mnt = real_mount(m);
3397 0 : struct mnt_namespace *ns;
3398 0 : struct super_block *s;
3399 0 : struct path path;
3400 0 : int err;
3401 :
3402 0 : ns = alloc_mnt_ns(&init_user_ns, true);
3403 0 : if (IS_ERR(ns)) {
3404 0 : mntput(m);
3405 0 : return ERR_CAST(ns);
3406 : }
3407 0 : mnt->mnt_ns = ns;
3408 0 : ns->root = mnt;
3409 0 : ns->mounts++;
3410 0 : list_add(&mnt->mnt_list, &ns->list);
3411 :
3412 0 : err = vfs_path_lookup(m->mnt_root, m,
3413 : name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
3414 :
3415 0 : put_mnt_ns(ns);
3416 :
3417 0 : if (err)
3418 0 : return ERR_PTR(err);
3419 :
3420 : /* trade a vfsmount reference for active sb one */
3421 0 : s = path.mnt->mnt_sb;
3422 0 : atomic_inc(&s->s_active);
3423 0 : mntput(path.mnt);
3424 : /* lock the sucker */
3425 0 : down_write(&s->s_umount);
3426 : /* ... and return the root of (sub)tree on it */
3427 0 : return path.dentry;
3428 : }
3429 : EXPORT_SYMBOL(mount_subtree);
3430 :
3431 524 : SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3432 : char __user *, type, unsigned long, flags, void __user *, data)
3433 : {
3434 262 : int ret;
3435 262 : char *kernel_type;
3436 262 : char *kernel_dev;
3437 262 : void *options;
3438 :
3439 262 : kernel_type = copy_mount_string(type);
3440 262 : ret = PTR_ERR(kernel_type);
3441 262 : if (IS_ERR(kernel_type))
3442 0 : goto out_type;
3443 :
3444 262 : kernel_dev = copy_mount_string(dev_name);
3445 262 : ret = PTR_ERR(kernel_dev);
3446 262 : if (IS_ERR(kernel_dev))
3447 0 : goto out_dev;
3448 :
3449 262 : options = copy_mount_options(data);
3450 262 : ret = PTR_ERR(options);
3451 262 : if (IS_ERR(options))
3452 0 : goto out_data;
3453 :
3454 262 : ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
3455 :
3456 262 : kfree(options);
3457 262 : out_data:
3458 262 : kfree(kernel_dev);
3459 262 : out_dev:
3460 262 : kfree(kernel_type);
3461 262 : out_type:
3462 262 : return ret;
3463 : }
3464 :
3465 : #define FSMOUNT_VALID_FLAGS \
3466 : (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
3467 : MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
3468 :
3469 : #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
3470 :
3471 : #define MOUNT_SETATTR_PROPAGATION_FLAGS \
3472 : (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
3473 :
3474 0 : static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
3475 : {
3476 0 : unsigned int mnt_flags = 0;
3477 :
3478 0 : if (attr_flags & MOUNT_ATTR_RDONLY)
3479 0 : mnt_flags |= MNT_READONLY;
3480 0 : if (attr_flags & MOUNT_ATTR_NOSUID)
3481 0 : mnt_flags |= MNT_NOSUID;
3482 0 : if (attr_flags & MOUNT_ATTR_NODEV)
3483 0 : mnt_flags |= MNT_NODEV;
3484 0 : if (attr_flags & MOUNT_ATTR_NOEXEC)
3485 0 : mnt_flags |= MNT_NOEXEC;
3486 0 : if (attr_flags & MOUNT_ATTR_NODIRATIME)
3487 0 : mnt_flags |= MNT_NODIRATIME;
3488 :
3489 0 : return mnt_flags;
3490 : }
3491 :
3492 : /*
3493 : * Create a kernel mount representation for a new, prepared superblock
3494 : * (specified by fs_fd) and attach to an open_tree-like file descriptor.
3495 : */
3496 0 : SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
3497 : unsigned int, attr_flags)
3498 : {
3499 0 : struct mnt_namespace *ns;
3500 0 : struct fs_context *fc;
3501 0 : struct file *file;
3502 0 : struct path newmount;
3503 0 : struct mount *mnt;
3504 0 : struct fd f;
3505 0 : unsigned int mnt_flags = 0;
3506 0 : long ret;
3507 :
3508 0 : if (!may_mount())
3509 : return -EPERM;
3510 :
3511 0 : if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
3512 : return -EINVAL;
3513 :
3514 0 : if (attr_flags & ~FSMOUNT_VALID_FLAGS)
3515 : return -EINVAL;
3516 :
3517 0 : mnt_flags = attr_flags_to_mnt_flags(attr_flags);
3518 :
3519 0 : switch (attr_flags & MOUNT_ATTR__ATIME) {
3520 : case MOUNT_ATTR_STRICTATIME:
3521 : break;
3522 0 : case MOUNT_ATTR_NOATIME:
3523 0 : mnt_flags |= MNT_NOATIME;
3524 0 : break;
3525 0 : case MOUNT_ATTR_RELATIME:
3526 0 : mnt_flags |= MNT_RELATIME;
3527 0 : break;
3528 : default:
3529 : return -EINVAL;
3530 : }
3531 :
3532 0 : f = fdget(fs_fd);
3533 0 : if (!f.file)
3534 : return -EBADF;
3535 :
3536 0 : ret = -EINVAL;
3537 0 : if (f.file->f_op != &fscontext_fops)
3538 0 : goto err_fsfd;
3539 :
3540 0 : fc = f.file->private_data;
3541 :
3542 0 : ret = mutex_lock_interruptible(&fc->uapi_mutex);
3543 0 : if (ret < 0)
3544 0 : goto err_fsfd;
3545 :
3546 : /* There must be a valid superblock or we can't mount it */
3547 0 : ret = -EINVAL;
3548 0 : if (!fc->root)
3549 0 : goto err_unlock;
3550 :
3551 0 : ret = -EPERM;
3552 0 : if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
3553 0 : pr_warn("VFS: Mount too revealing\n");
3554 0 : goto err_unlock;
3555 : }
3556 :
3557 0 : ret = -EBUSY;
3558 0 : if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
3559 0 : goto err_unlock;
3560 :
3561 0 : ret = -EPERM;
3562 0 : if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
3563 0 : goto err_unlock;
3564 :
3565 0 : newmount.mnt = vfs_create_mount(fc);
3566 0 : if (IS_ERR(newmount.mnt)) {
3567 0 : ret = PTR_ERR(newmount.mnt);
3568 0 : goto err_unlock;
3569 : }
3570 0 : newmount.dentry = dget(fc->root);
3571 0 : newmount.mnt->mnt_flags = mnt_flags;
3572 :
3573 : /* We've done the mount bit - now move the file context into more or
3574 : * less the same state as if we'd done an fspick(). We don't want to
3575 : * do any memory allocation or anything like that at this point as we
3576 : * don't want to have to handle any errors incurred.
3577 : */
3578 0 : vfs_clean_context(fc);
3579 :
3580 0 : ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
3581 0 : if (IS_ERR(ns)) {
3582 0 : ret = PTR_ERR(ns);
3583 0 : goto err_path;
3584 : }
3585 0 : mnt = real_mount(newmount.mnt);
3586 0 : mnt->mnt_ns = ns;
3587 0 : ns->root = mnt;
3588 0 : ns->mounts = 1;
3589 0 : list_add(&mnt->mnt_list, &ns->list);
3590 0 : mntget(newmount.mnt);
3591 :
3592 : /* Attach to an apparent O_PATH fd with a note that we need to unmount
3593 : * it, not simply put it.
3594 : */
3595 0 : file = dentry_open(&newmount, O_PATH, fc->cred);
3596 0 : if (IS_ERR(file)) {
3597 0 : dissolve_on_fput(newmount.mnt);
3598 0 : ret = PTR_ERR(file);
3599 0 : goto err_path;
3600 : }
3601 0 : file->f_mode |= FMODE_NEED_UNMOUNT;
3602 :
3603 0 : ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
3604 0 : if (ret >= 0)
3605 0 : fd_install(ret, file);
3606 : else
3607 0 : fput(file);
3608 :
3609 0 : err_path:
3610 0 : path_put(&newmount);
3611 0 : err_unlock:
3612 0 : mutex_unlock(&fc->uapi_mutex);
3613 0 : err_fsfd:
3614 0 : fdput(f);
3615 0 : return ret;
3616 : }
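/*
 * Example (userspace): the full new-API pipeline ending in fsmount() above:
 * configure a superblock with fsopen()/fsconfig(), turn it into a mount
 * object, then attach it with move_mount(2). Raw syscall(2) is used since
 * libc wrappers are recent; constants come from <linux/mount.h>, the target
 * path is illustrative, and fsconfig() errors are checked loosely.
 */

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int sfd = syscall(SYS_fsopen, "tmpfs", FSOPEN_CLOEXEC);
	if (sfd < 0)
		return 1;
	syscall(SYS_fsconfig, sfd, FSCONFIG_SET_STRING, "size", "16m", 0);
	syscall(SYS_fsconfig, sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	int mfd = syscall(SYS_fsmount, sfd, FSMOUNT_CLOEXEC,
			  MOUNT_ATTR_NODEV);
	if (mfd < 0)
		return 1;
	if (syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt/t",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		return 1;
	close(mfd);
	close(sfd);
	return 0;
}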
3617 :
3618 : /*
3619 : * Move a mount from one place to another. In combination with
3620 : * fsopen()/fsmount() this is used to install a new mount and in combination
3621 : * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
3622 : * a mount subtree.
3623 : *
3624 : * Note the flags value is a combination of MOVE_MOUNT_* flags.
3625 : */
3626 6 : SYSCALL_DEFINE5(move_mount,
3627 : int, from_dfd, const char __user *, from_pathname,
3628 : int, to_dfd, const char __user *, to_pathname,
3629 : unsigned int, flags)
3630 : {
3631 3 : struct path from_path, to_path;
3632 3 : unsigned int lflags;
3633 3 : int ret = 0;
3634 :
3635 3 : if (!may_mount())
3636 : return -EPERM;
3637 :
3638 3 : if (flags & ~MOVE_MOUNT__MASK)
3639 : return -EINVAL;
3640 :
3641 : /* If someone gives a pathname, they aren't permitted to move
3642 : * from an fd that requires unmount as we can't get at the flag
3643 : * to clear it afterwards.
3644 : */
3645 3 : lflags = 0;
3646 3 : if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
3647 3 : if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
3648 3 : if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
3649 :
3650 3 : ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
3651 3 : if (ret < 0)
3652 0 : return ret;
3653 :
3654 3 : lflags = 0;
3655 3 : if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
3656 3 : if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
3657 3 : if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
3658 :
3659 3 : ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
3660 3 : if (ret < 0)
3661 0 : goto out_from;
3662 :
3663 3 : ret = security_move_mount(&from_path, &to_path);
3664 3 : if (ret < 0)
3665 1 : goto out_to;
3666 :
3667 2 : ret = do_move_mount(&from_path, &to_path);
3668 :
3669 3 : out_to:
3670 3 : path_put(&to_path);
3671 3 : out_from:
3672 3 : path_put(&from_path);
3673 3 : return ret;
3674 : }
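
/*
 * Illustrative userspace sketch (not part of this file, untested):
 * combined with open_tree(OPEN_TREE_CLONE | AT_RECURSIVE), move_mount()
 * attaches a detached copy of an existing mount subtree, as described
 * above.  "/src" and "/dst" are example paths; error handling omitted.
 *
 *	int fd = open_tree(AT_FDCWD, "/src",
 *			   OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
 *	move_mount(fd, "", AT_FDCWD, "/dst", MOVE_MOUNT_F_EMPTY_PATH);
 */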
3675 :
3676 : /*
3677 : * Return true if path is reachable from root
3678 : *
3679 : * namespace_sem or mount_lock is held
3680 : */
3681 2 : bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
3682 : const struct path *root)
3683 : {
3684 4 : while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
3685 2 : dentry = mnt->mnt_mountpoint;
3686 2 : mnt = mnt->mnt_parent;
3687 : }
3688 2 : return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
3689 : }
3690 :
3691 0 : bool path_is_under(const struct path *path1, const struct path *path2)
3692 : {
3693 0 : bool res;
3694 0 : read_seqlock_excl(&mount_lock);
3695 0 : res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
3696 0 : read_sequnlock_excl(&mount_lock);
3697 0 : return res;
3698 : }
3699 : EXPORT_SYMBOL(path_is_under);
3700 :
3701 : /*
3702 : * pivot_root Semantics:
3703 : * Moves the root file system of the current process to the directory put_old,
3704 : * makes new_root the new root file system of the current process, and sets
3705 : * root/cwd of all processes which had them on the current root to new_root.
3706 : *
3707 : * Restrictions:
3708 : * The new_root and put_old must be directories, and must not be on the
3709 : * same file system as the current process root. The put_old must be
3710 : * underneath new_root, i.e. adding a non-zero number of /.. to the string
3711 : * pointed to by put_old must yield the same directory as new_root. No other
3712 : * file system may be mounted on put_old. After all, new_root is a mountpoint.
3713 : *
3714 : * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
3715 : * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
3716 : * in this situation.
3717 : *
3718 : * Notes:
3719 : * - we don't move root/cwd if they are not at the root (reason: if something
3720 : * cared enough to change them, it's probably wrong to force them elsewhere)
3721 : * - it's okay to pick a root that isn't the root of a file system, e.g.
3722 : * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
3723 : * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
3724 : * first.
3725 : */
3726 4 : SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
3727 : const char __user *, put_old)
3728 : {
3729 2 : struct path new, old, root;
3730 2 : struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
3731 2 : struct mountpoint *old_mp, *root_mp;
3732 2 : int error;
3733 :
3734 2 : if (!may_mount())
3735 : return -EPERM;
3736 :
3737 2 : error = user_path_at(AT_FDCWD, new_root,
3738 : LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
3739 2 : if (error)
3740 0 : goto out0;
3741 :
3742 2 : error = user_path_at(AT_FDCWD, put_old,
3743 : LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
3744 2 : if (error)
3745 0 : goto out1;
3746 :
3747 2 : error = security_sb_pivotroot(&old, &new);
3748 2 : if (error)
3749 1 : goto out2;
3750 :
3751 1 : get_fs_root(current->fs, &root);
3752 1 : old_mp = lock_mount(&old);
3753 1 : error = PTR_ERR(old_mp);
3754 1 : if (IS_ERR(old_mp))
3755 0 : goto out3;
3756 :
3757 1 : error = -EINVAL;
3758 1 : new_mnt = real_mount(new.mnt);
3759 1 : root_mnt = real_mount(root.mnt);
3760 1 : old_mnt = real_mount(old.mnt);
3761 1 : ex_parent = new_mnt->mnt_parent;
3762 1 : root_parent = root_mnt->mnt_parent;
3763 1 : if (IS_MNT_SHARED(old_mnt) ||
3764 1 : IS_MNT_SHARED(ex_parent) ||
3765 1 : IS_MNT_SHARED(root_parent))
3766 0 : goto out4;
3767 1 : if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
3768 0 : goto out4;
3769 1 : if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
3770 0 : goto out4;
3771 1 : error = -ENOENT;
3772 1 : if (d_unlinked(new.dentry))
3773 0 : goto out4;
3774 1 : error = -EBUSY;
3775 1 : if (new_mnt == root_mnt || old_mnt == root_mnt)
3776 0 : goto out4; /* loop, on the same file system */
3777 1 : error = -EINVAL;
3778 1 : if (root.mnt->mnt_root != root.dentry)
3779 0 : goto out4; /* not a mountpoint */
3780 1 : if (!mnt_has_parent(root_mnt))
3781 0 : goto out4; /* not attached */
3782 1 : if (new.mnt->mnt_root != new.dentry)
3783 0 : goto out4; /* not a mountpoint */
3784 1 : if (!mnt_has_parent(new_mnt))
3785 0 : goto out4; /* not attached */
3786 : /* make sure we can reach put_old from new_root */
3787 1 : if (!is_path_reachable(old_mnt, old.dentry, &new))
3788 0 : goto out4;
3789 : /* make certain new is below the root */
3790 1 : if (!is_path_reachable(new_mnt, new.dentry, &root))
3791 0 : goto out4;
3792 1 : lock_mount_hash();
3793 1 : umount_mnt(new_mnt);
3794 1 : root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
3795 1 : if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
3796 0 : new_mnt->mnt.mnt_flags |= MNT_LOCKED;
3797 0 : root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
3798 : }
3799 : /* mount old root on put_old */
3800 1 : attach_mnt(root_mnt, old_mnt, old_mp);
3801 : /* mount new_root on / */
3802 1 : attach_mnt(new_mnt, root_parent, root_mp);
3803 1 : mnt_add_count(root_parent, -1);
3804 1 : touch_mnt_namespace(current->nsproxy->mnt_ns);
3805 : /* A moved mount should not expire automatically */
3806 1 : list_del_init(&new_mnt->mnt_expire);
3807 1 : put_mountpoint(root_mp);
3808 1 : unlock_mount_hash();
3809 1 : chroot_fs_refs(&root, &new);
3810 1 : error = 0;
3811 1 : out4:
3812 1 : unlock_mount(old_mp);
3813 1 : if (!error)
3814 1 : mntput_no_expire(ex_parent);
3815 0 : out3:
3816 1 : path_put(&root);
3817 2 : out2:
3818 2 : path_put(&old);
3819 2 : out1:
3820 2 : path_put(&new);
3821 2 : out0:
3822 2 : return error;
3823 : }
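
/*
 * Illustrative userspace sketch (not part of this file, untested): the
 * common container idiom pivots into a prepared root and detaches the
 * old one.  Passing "." for both arguments works because put_old is
 * then trivially underneath new_root, so no scratch directory is
 * needed.  This assumes a private mount namespace (e.g. after
 * unshare(CLONE_NEWNS) with propagation made private); "/newroot" is
 * an example path, made a mount point by the bind mount as required
 * above.  Error handling is omitted.
 *
 *	mount("/newroot", "/newroot", NULL, MS_BIND | MS_REC, NULL);
 *	chdir("/newroot");
 *	syscall(SYS_pivot_root, ".", ".");
 *	umount2(".", MNT_DETACH);
 */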
3824 :
3825 0 : static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
3826 : {
3827 0 : unsigned int flags = mnt->mnt.mnt_flags;
3828 :
3829 : /* flags to clear */
3830 0 : flags &= ~kattr->attr_clr;
3831 : /* flags to raise */
3832 0 : flags |= kattr->attr_set;
3833 :
3834 0 : return flags;
3835 : }
3836 :
3837 0 : static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
3838 : {
3839 0 : struct vfsmount *m = &mnt->mnt;
3840 :
3841 0 : if (!kattr->mnt_userns)
3842 : return 0;
3843 :
3844 : /*
3845 : * Once a mount has been idmapped we don't allow it to change its
3846 : * mapping. It makes things simpler and callers can just create
3847 : * another bind-mount they can idmap if they want to.
3848 : */
3849 0 : if (mnt_user_ns(m) != &init_user_ns)
3850 : return -EPERM;
3851 :
3852 : /* The underlying filesystem doesn't support idmapped mounts yet. */
3853 0 : if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
3854 : return -EINVAL;
3855 :
3856 : /* We're not controlling the superblock. */
3857 0 : if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
3858 : return -EPERM;
3859 :
3860 : /* The mount is already visible in the filesystem hierarchy. */
3861 0 : if (!is_anon_ns(mnt->mnt_ns))
3862 0 : return -EINVAL;
3863 :
3864 : return 0;
3865 : }
3866 :
3867 0 : static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
3868 : struct mount *mnt, int *err)
3869 : {
3870 0 : struct mount *m = mnt, *last = NULL;
3871 :
3872 0 : if (!is_mounted(&m->mnt)) {
3873 0 : *err = -EINVAL;
3874 0 : goto out;
3875 : }
3876 :
3877 0 : if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
3878 0 : *err = -EINVAL;
3879 0 : goto out;
3880 : }
3881 :
3882 0 : do {
3883 0 : unsigned int flags;
3884 :
3885 0 : flags = recalc_flags(kattr, m);
3886 0 : if (!can_change_locked_flags(m, flags)) {
3887 0 : *err = -EPERM;
3888 0 : goto out;
3889 : }
3890 :
3891 0 : *err = can_idmap_mount(kattr, m);
3892 0 : if (*err)
3893 0 : goto out;
3894 :
3895 0 : last = m;
3896 :
3897 0 : if ((kattr->attr_set & MNT_READONLY) &&
3898 0 : !(m->mnt.mnt_flags & MNT_READONLY)) {
3899 0 : *err = mnt_hold_writers(m);
3900 0 : if (*err)
3901 0 : goto out;
3902 : }
3903 0 : } while (kattr->recurse && (m = next_mnt(m, mnt)));
3904 :
3905 0 : out:
3906 0 : return last;
3907 : }
3908 :
3909 0 : static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
3910 : {
3911 0 : struct user_namespace *mnt_userns;
3912 :
3913 0 : if (!kattr->mnt_userns)
3914 : return;
3915 :
3916 0 : mnt_userns = get_user_ns(kattr->mnt_userns);
3917 : /* Pairs with smp_load_acquire() in mnt_user_ns(). */
3918 0 : smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
3919 : }
3920 :
3921 0 : static void mount_setattr_commit(struct mount_kattr *kattr,
3922 : struct mount *mnt, struct mount *last,
3923 : int err)
3924 : {
3925 0 : struct mount *m = mnt;
3926 :
3927 0 : do {
3928 0 : if (!err) {
3929 0 : unsigned int flags;
3930 :
3931 0 : do_idmap_mount(kattr, m);
3932 0 : flags = recalc_flags(kattr, m);
3933 0 : WRITE_ONCE(m->mnt.mnt_flags, flags);
3934 : }
3935 :
3936 : /*
3937 : * If we set MNT_READONLY above, make it visible before
3938 : * clearing MNT_WRITE_HOLD; otherwise we failed to recursively
3939 : * apply the mount options and only need to drop the holds.
3940 : */
3941 0 : if ((kattr->attr_set & MNT_READONLY) &&
3942 0 : (m->mnt.mnt_flags & MNT_WRITE_HOLD))
3943 0 : mnt_unhold_writers(m);
3944 :
3945 0 : if (!err && kattr->propagation)
3946 0 : change_mnt_propagation(m, kattr->propagation);
3947 :
3948 : /*
3949 : * On failure, only clean up until we reach the first
3950 : * mount we failed to handle.
3951 : */
3952 0 : if (err && m == last)
3953 : break;
3954 0 : } while (kattr->recurse && (m = next_mnt(m, mnt)));
3955 :
3956 0 : if (!err)
3957 0 : touch_mnt_namespace(mnt->mnt_ns);
3958 0 : }
3959 :
3960 0 : static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
3961 : {
3962 0 : struct mount *mnt = real_mount(path->mnt), *last = NULL;
3963 0 : int err = 0;
3964 :
3965 0 : if (path->dentry != mnt->mnt.mnt_root)
3966 : return -EINVAL;
3967 :
3968 0 : if (kattr->propagation) {
3969 : /*
3970 : * Only take namespace_lock() if we're actually changing
3971 : * propagation.
3972 : */
3973 0 : namespace_lock();
3974 0 : if (kattr->propagation == MS_SHARED) {
3975 0 : err = invent_group_ids(mnt, kattr->recurse);
3976 0 : if (err) {
3977 0 : namespace_unlock();
3978 0 : return err;
3979 : }
3980 : }
3981 : }
3982 :
3983 0 : lock_mount_hash();
3984 :
3985 : /*
3986 : * Get the mount tree in a shape where we can change mount
3987 : * properties without failure.
3988 : */
3989 0 : last = mount_setattr_prepare(kattr, mnt, &err);
3990 0 : if (last) /* Commit all changes or revert to the old state. */
3991 0 : mount_setattr_commit(kattr, mnt, last, err);
3992 :
3993 0 : unlock_mount_hash();
3994 :
3995 0 : if (kattr->propagation) {
3996 0 : namespace_unlock();
3997 0 : if (err)
3998 0 : cleanup_group_ids(mnt, NULL);
3999 : }
4000 :
4001 0 : return err;
4002 : }
4003 :
4004 0 : static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4005 : struct mount_kattr *kattr, unsigned int flags)
4006 : {
4007 0 : int err = 0;
4008 0 : struct ns_common *ns;
4009 0 : struct user_namespace *mnt_userns;
4010 0 : struct file *file;
4011 :
4012 0 : if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
4013 : return 0;
4014 :
4015 : /*
4016 : * We currently do not support clearing an idmapped mount. If this
4017 : * ever becomes a use-case we can revisit it, but for now keep things
4018 : * simple and disallow it.
4019 : */
4020 0 : if (attr->attr_clr & MOUNT_ATTR_IDMAP)
4021 : return -EINVAL;
4022 :
4023 0 : if (attr->userns_fd > INT_MAX)
4024 : return -EINVAL;
4025 :
4026 0 : file = fget(attr->userns_fd);
4027 0 : if (!file)
4028 : return -EBADF;
4029 :
4030 0 : if (!proc_ns_file(file)) {
4031 0 : err = -EINVAL;
4032 0 : goto out_fput;
4033 : }
4034 :
4035 0 : ns = get_proc_ns(file_inode(file));
4036 0 : if (ns->ops->type != CLONE_NEWUSER) {
4037 0 : err = -EINVAL;
4038 0 : goto out_fput;
4039 : }
4040 :
4041 : /*
4042 : * The init_user_ns is used to indicate that a vfsmount is not idmapped.
4043 : * This is simpler than just having to treat NULL as unmapped. Users
4044 : * wanting to idmap a mount to init_user_ns can just use a namespace
4045 : * with an identity mapping.
4046 : */
4047 0 : mnt_userns = container_of(ns, struct user_namespace, ns);
4048 0 : if (mnt_userns == &init_user_ns) {
4049 0 : err = -EPERM;
4050 0 : goto out_fput;
4051 : }
4052 0 : kattr->mnt_userns = get_user_ns(mnt_userns);
4053 :
4054 0 : out_fput:
4055 0 : fput(file);
4056 0 : return err;
4057 : }
4058 :
4059 0 : static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4060 : struct mount_kattr *kattr, unsigned int flags)
4061 : {
4062 0 : unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
4063 :
4064 0 : if (flags & AT_NO_AUTOMOUNT)
4065 0 : lookup_flags &= ~LOOKUP_AUTOMOUNT;
4066 0 : if (flags & AT_SYMLINK_NOFOLLOW)
4067 0 : lookup_flags &= ~LOOKUP_FOLLOW;
4068 0 : if (flags & AT_EMPTY_PATH)
4069 0 : lookup_flags |= LOOKUP_EMPTY;
4070 :
4071 0 : *kattr = (struct mount_kattr) {
4072 : .lookup_flags = lookup_flags,
4073 0 : .recurse = !!(flags & AT_RECURSIVE),
4074 : };
4075 :
4076 0 : if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4077 : return -EINVAL;
4078 0 : if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
4079 : return -EINVAL;
4080 0 : kattr->propagation = attr->propagation;
4081 :
4082 0 : if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4083 : return -EINVAL;
4084 :
4085 0 : kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
4086 0 : kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
4087 :
4088 : /*
4089 : * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4090 : * users wanting to transition to a different atime setting cannot
4091 : * simply specify the atime setting in @attr_set, but must also
4092 : * specify MOUNT_ATTR__ATIME in the @attr_clr field.
4093 : * So ensure that MOUNT_ATTR__ATIME can't be partially set in
4094 : * @attr_clr and that @attr_set can't have any atime bits set if
4095 : * MOUNT_ATTR__ATIME isn't set in @attr_clr.
4096 : */
4097 0 : if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4098 0 : if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4099 : return -EINVAL;
4100 :
4101 : /*
4102 : * Clear all previous time settings as they are mutually
4103 : * exclusive.
4104 : */
4105 0 : kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
4106 0 : switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4107 0 : case MOUNT_ATTR_RELATIME:
4108 0 : kattr->attr_set |= MNT_RELATIME;
4109 0 : break;
4110 0 : case MOUNT_ATTR_NOATIME:
4111 0 : kattr->attr_set |= MNT_NOATIME;
4112 0 : break;
4113 : case MOUNT_ATTR_STRICTATIME:
4114 : break;
4115 : default:
4116 : return -EINVAL;
4117 : }
4118 : } else {
4119 0 : if (attr->attr_set & MOUNT_ATTR__ATIME)
4120 : return -EINVAL;
4121 : }
4122 :
4123 0 : return build_mount_idmapped(attr, usize, kattr, flags);
4124 : }
4125 :
4126 0 : static void finish_mount_kattr(struct mount_kattr *kattr)
4127 : {
4128 0 : put_user_ns(kattr->mnt_userns);
4129 0 : kattr->mnt_userns = NULL;
4130 : }
4131 :
4132 0 : SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4133 : unsigned int, flags, struct mount_attr __user *, uattr,
4134 : size_t, usize)
4135 : {
4136 0 : int err;
4137 0 : struct path target;
4138 0 : struct mount_attr attr;
4139 0 : struct mount_kattr kattr;
4140 :
4141 0 : BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4142 :
4143 0 : if (flags & ~(AT_EMPTY_PATH |
4144 : AT_RECURSIVE |
4145 : AT_SYMLINK_NOFOLLOW |
4146 : AT_NO_AUTOMOUNT))
4147 : return -EINVAL;
4148 :
4149 0 : if (unlikely(usize > PAGE_SIZE))
4150 : return -E2BIG;
4151 0 : if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4152 : return -EINVAL;
4153 :
4154 0 : if (!may_mount())
4155 : return -EPERM;
4156 :
4157 0 : err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
4158 0 : if (err)
4159 0 : return err;
4160 :
4161 : /* Don't bother walking through the mounts if this is a nop. */
4162 0 : if (attr.attr_set == 0 &&
4163 0 : attr.attr_clr == 0 &&
4164 0 : attr.propagation == 0)
4165 : return 0;
4166 :
4167 0 : err = build_mount_kattr(&attr, usize, &kattr, flags);
4168 0 : if (err)
4169 0 : return err;
4170 :
4171 0 : err = user_path_at(dfd, path, kattr.lookup_flags, &target);
4172 0 : if (err)
4173 0 : return err;
4174 :
4175 0 : err = do_mount_setattr(&target, &kattr);
4176 0 : finish_mount_kattr(&kattr);
4177 0 : path_put(&target);
4178 0 : return err;
4179 : }
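
/*
 * Illustrative userspace sketch (not part of this file, untested):
 * recursively make a tree read-only and switch it to noatime.  As the
 * comment in build_mount_kattr() explains, the atime modes form an
 * enum, so MOUNT_ATTR__ATIME must be set in attr_clr when changing
 * them.  "/mnt" is an example path; error handling is omitted.
 *
 *	struct mount_attr attr = {
 *		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOATIME,
 *		.attr_clr = MOUNT_ATTR__ATIME,
 *	};
 *
 *	syscall(SYS_mount_setattr, AT_FDCWD, "/mnt", AT_RECURSIVE,
 *		&attr, sizeof(attr));
 */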
4180 :
4181 1 : static void __init init_mount_tree(void)
4182 : {
4183 1 : struct vfsmount *mnt;
4184 1 : struct mount *m;
4185 1 : struct mnt_namespace *ns;
4186 1 : struct path root;
4187 :
4188 1 : mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
4189 1 : if (IS_ERR(mnt))
4190 0 : panic("Can't create rootfs");
4191 :
4192 1 : ns = alloc_mnt_ns(&init_user_ns, false);
4193 1 : if (IS_ERR(ns))
4194 0 : panic("Can't allocate initial namespace");
4195 1 : m = real_mount(mnt);
4196 1 : m->mnt_ns = ns;
4197 1 : ns->root = m;
4198 1 : ns->mounts = 1;
4199 1 : list_add(&m->mnt_list, &ns->list);
4200 1 : init_task.nsproxy->mnt_ns = ns;
4201 1 : get_mnt_ns(ns);
4202 :
4203 1 : root.mnt = mnt;
4204 1 : root.dentry = mnt->mnt_root;
4205 1 : mnt->mnt_flags |= MNT_LOCKED;
4206 :
4207 1 : set_fs_pwd(current->fs, &root);
4208 1 : set_fs_root(current->fs, &root);
4209 1 : }
4210 :
4211 1 : void __init mnt_init(void)
4212 : {
4213 1 : int err;
4214 :
4215 1 : mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
4216 : 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
4217 :
4218 1 : mount_hashtable = alloc_large_system_hash("Mount-cache",
4219 : sizeof(struct hlist_head),
4220 : mhash_entries, 19,
4221 : HASH_ZERO,
4222 : &m_hash_shift, &m_hash_mask, 0, 0);
4223 1 : mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
4224 : sizeof(struct hlist_head),
4225 : mphash_entries, 19,
4226 : HASH_ZERO,
4227 : &mp_hash_shift, &mp_hash_mask, 0, 0);
4228 :
4229 1 : if (!mount_hashtable || !mountpoint_hashtable)
4230 0 : panic("Failed to allocate mount hash table\n");
4231 :
4232 1 : kernfs_init();
4233 :
4234 1 : err = sysfs_init();
4235 1 : if (err)
4236 0 : printk(KERN_WARNING "%s: sysfs_init error: %d\n",
4237 : __func__, err);
4238 1 : fs_kobj = kobject_create_and_add("fs", NULL);
4239 1 : if (!fs_kobj)
4240 0 : printk(KERN_WARNING "%s: kobj create error\n", __func__);
4241 1 : shmem_init();
4242 1 : init_rootfs();
4243 1 : init_mount_tree();
4244 1 : }
4245 :
4246 200 : void put_mnt_ns(struct mnt_namespace *ns)
4247 : {
4248 200 : if (!refcount_dec_and_test(&ns->ns.count))
4249 : return;
4250 47 : drop_collected_mounts(&ns->root->mnt);
4251 47 : free_mnt_ns(ns);
4252 : }
4253 :
4254 6 : struct vfsmount *kern_mount(struct file_system_type *type)
4255 : {
4256 6 : struct vfsmount *mnt;
4257 6 : mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
4258 6 : if (!IS_ERR(mnt)) {
4259 : /*
4260 : * This is a long-term mount; don't release mnt until it is
4261 : * unmounted, just before the filesystem is unregistered.
4262 : */
4263 6 : real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
4264 : }
4265 6 : return mnt;
4266 : }
4267 : EXPORT_SYMBOL_GPL(kern_mount);
4268 :
4269 0 : void kern_unmount(struct vfsmount *mnt)
4270 : {
4271 : /* release long term mount so mount point can be released */
4272 0 : if (!IS_ERR_OR_NULL(mnt)) {
4273 0 : real_mount(mnt)->mnt_ns = NULL;
4274 0 : synchronize_rcu(); /* yecchhh... */
4275 0 : mntput(mnt);
4276 : }
4277 0 : }
4278 : EXPORT_SYMBOL(kern_unmount);
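
/*
 * Illustrative in-kernel sketch (not part of this file): kern_mount()
 * and kern_unmount() typically bracket the lifetime of a subsystem's
 * internal mount.  "example_fs_type" is a hypothetical
 * file_system_type registered elsewhere.
 *
 *	static struct vfsmount *example_mnt;
 *
 *	static int __init example_init(void)
 *	{
 *		example_mnt = kern_mount(&example_fs_type);
 *		return PTR_ERR_OR_ZERO(example_mnt);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		kern_unmount(example_mnt);
 *	}
 */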
4279 :
4280 2 : void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
4281 : {
4282 2 : unsigned int i;
4283 :
4284 6 : for (i = 0; i < num; i++)
4285 4 : if (mnt[i])
4286 4 : real_mount(mnt[i])->mnt_ns = NULL;
4287 2 : synchronize_rcu_expedited();
4288 8 : for (i = 0; i < num; i++)
4289 4 : mntput(mnt[i]);
4290 2 : }
4291 : EXPORT_SYMBOL(kern_unmount_array);
4292 :
4293 0 : bool our_mnt(struct vfsmount *mnt)
4294 : {
4295 0 : return check_mnt(real_mount(mnt));
4296 : }
4297 :
4298 0 : bool current_chrooted(void)
4299 : {
4300 : /* Does the current process have a non-standard root */
4301 0 : struct path ns_root;
4302 0 : struct path fs_root;
4303 0 : bool chrooted;
4304 :
4305 : /* Find the namespace root */
4306 0 : ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
4307 0 : ns_root.dentry = ns_root.mnt->mnt_root;
4308 0 : path_get(&ns_root);
4309 0 : while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
4310 0 : ;
4311 :
4312 0 : get_fs_root(current->fs, &fs_root);
4313 :
4314 0 : chrooted = !path_equal(&fs_root, &ns_root);
4315 :
4316 0 : path_put(&fs_root);
4317 0 : path_put(&ns_root);
4318 :
4319 0 : return chrooted;
4320 : }
4321 :
4322 0 : static bool mnt_already_visible(struct mnt_namespace *ns,
4323 : const struct super_block *sb,
4324 : int *new_mnt_flags)
4325 : {
4326 0 : int new_flags = *new_mnt_flags;
4327 0 : struct mount *mnt;
4328 0 : bool visible = false;
4329 :
4330 0 : down_read(&namespace_sem);
4331 0 : lock_ns_list(ns);
4332 0 : list_for_each_entry(mnt, &ns->list, mnt_list) {
4333 0 : struct mount *child;
4334 0 : int mnt_flags;
4335 :
4336 0 : if (mnt_is_cursor(mnt))
4337 0 : continue;
4338 :
4339 0 : if (mnt->mnt.mnt_sb->s_type != sb->s_type)
4340 0 : continue;
4341 :
4342 : /* This mount is not fully visible if its root directory
4343 : * is not the root directory of the filesystem.
4344 : */
4345 0 : if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
4346 0 : continue;
4347 :
4348 : /* A local view of the mount flags */
4349 0 : mnt_flags = mnt->mnt.mnt_flags;
4350 :
4351 : /* Don't miss readonly hidden in the superblock flags */
4352 0 : if (sb_rdonly(mnt->mnt.mnt_sb))
4353 0 : mnt_flags |= MNT_LOCK_READONLY;
4354 :
4355 : /* Verify the mount flags are equal to or more permissive
4356 : * than the proposed new mount.
4357 : */
4358 0 : if ((mnt_flags & MNT_LOCK_READONLY) &&
4359 0 : !(new_flags & MNT_READONLY))
4360 0 : continue;
4361 0 : if ((mnt_flags & MNT_LOCK_ATIME) &&
4362 0 : ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
4363 0 : continue;
4364 :
4365 : /* This mount is not fully visible if there are any
4366 : * locked child mounts that cover anything except for
4367 : * empty directories.
4368 : */
4369 0 : list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
4370 0 : struct inode *inode = child->mnt_mountpoint->d_inode;
4371 : /* Only worry about locked mounts */
4372 0 : if (!(child->mnt.mnt_flags & MNT_LOCKED))
4373 0 : continue;
4374 : /* Is the directory permanently empty? */
4375 0 : if (!is_empty_dir_inode(inode))
4376 0 : goto next;
4377 : }
4378 : /* Preserve the locked attributes */
4379 0 : *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
4380 : MNT_LOCK_ATIME);
4381 0 : visible = true;
4382 0 : goto found;
4383 0 : next: ;
4384 : }
4385 0 : found:
4386 0 : unlock_ns_list(ns);
4387 0 : up_read(&namespace_sem);
4388 0 : return visible;
4389 : }
4390 :
4391 115 : static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
4392 : {
4393 115 : const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
4394 115 : struct mnt_namespace *ns = current->nsproxy->mnt_ns;
4395 115 : unsigned long s_iflags;
4396 :
4397 115 : if (ns->user_ns == &init_user_ns)
4398 : return false;
4399 :
4400 : /* Can this filesystem be too revealing? */
4401 0 : s_iflags = sb->s_iflags;
4402 0 : if (!(s_iflags & SB_I_USERNS_VISIBLE))
4403 : return false;
4404 :
4405 0 : if ((s_iflags & required_iflags) != required_iflags) {
4406 0 : WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
4407 : required_iflags);
4408 0 : return true;
4409 : }
4410 :
4411 0 : return !mnt_already_visible(ns, sb, new_mnt_flags);
4412 : }
4413 :
4414 2094 : bool mnt_may_suid(struct vfsmount *mnt)
4415 : {
4416 : /*
4417 : * Foreign mounts (accessed via fchdir or through /proc
4418 : * symlinks) are always treated as if they are nosuid. This
4419 : * prevents namespaces from trusting potentially unsafe
4420 : * suid/sgid bits, file caps, or security labels that originate
4421 : * in other namespaces.
4422 : */
4423 2094 : return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
4424 2094 : current_in_userns(mnt->mnt_sb->s_user_ns);
4425 : }
4426 :
4427 3 : static struct ns_common *mntns_get(struct task_struct *task)
4428 : {
4429 3 : struct ns_common *ns = NULL;
4430 3 : struct nsproxy *nsproxy;
4431 :
4432 3 : task_lock(task);
4433 3 : nsproxy = task->nsproxy;
4434 3 : if (nsproxy) {
4435 3 : ns = &nsproxy->mnt_ns->ns;
4436 3 : get_mnt_ns(to_mnt_ns(ns));
4437 : }
4438 3 : task_unlock(task);
4439 :
4440 3 : return ns;
4441 : }
4442 :
4443 3 : static void mntns_put(struct ns_common *ns)
4444 : {
4445 3 : put_mnt_ns(to_mnt_ns(ns));
4446 3 : }
4447 :
4448 0 : static int mntns_install(struct nsset *nsset, struct ns_common *ns)
4449 : {
4450 0 : struct nsproxy *nsproxy = nsset->nsproxy;
4451 0 : struct fs_struct *fs = nsset->fs;
4452 0 : struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
4453 0 : struct user_namespace *user_ns = nsset->cred->user_ns;
4454 0 : struct path root;
4455 0 : int err;
4456 :
4457 0 : if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
4458 0 : !ns_capable(user_ns, CAP_SYS_CHROOT) ||
4459 0 : !ns_capable(user_ns, CAP_SYS_ADMIN))
4460 0 : return -EPERM;
4461 :
4462 0 : if (is_anon_ns(mnt_ns))
4463 : return -EINVAL;
4464 :
4465 0 : if (fs->users != 1)
4466 : return -EINVAL;
4467 :
4468 0 : get_mnt_ns(mnt_ns);
4469 0 : old_mnt_ns = nsproxy->mnt_ns;
4470 0 : nsproxy->mnt_ns = mnt_ns;
4471 :
4472 : /* Find the root */
4473 0 : err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
4474 : "/", LOOKUP_DOWN, &root);
4475 0 : if (err) {
4476 : /* revert to old namespace */
4477 0 : nsproxy->mnt_ns = old_mnt_ns;
4478 0 : put_mnt_ns(mnt_ns);
4479 0 : return err;
4480 : }
4481 :
4482 0 : put_mnt_ns(old_mnt_ns);
4483 :
4484 : /* Update the pwd and root */
4485 0 : set_fs_pwd(fs, &root);
4486 0 : set_fs_root(fs, &root);
4487 :
4488 0 : path_put(&root);
4489 0 : return 0;
4490 : }
4491 :
4492 0 : static struct user_namespace *mntns_owner(struct ns_common *ns)
4493 : {
4494 0 : return to_mnt_ns(ns)->user_ns;
4495 : }
4496 :
4497 : const struct proc_ns_operations mntns_operations = {
4498 : .name = "mnt",
4499 : .type = CLONE_NEWNS,
4500 : .get = mntns_get,
4501 : .put = mntns_put,
4502 : .install = mntns_install,
4503 : .owner = mntns_owner,
4504 : };
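
/*
 * Illustrative userspace sketch (not part of this file, untested):
 * these operations back the /proc/<pid>/ns/mnt files, so joining
 * another task's mount namespace is a setns() on such an fd.  As
 * mntns_install() enforces, the caller needs CAP_SYS_ADMIN over the
 * target namespace, CAP_SYS_CHROOT and CAP_SYS_ADMIN in its own user
 * namespace, and must not share its fs_struct with another task.
 *
 *	int fd = open("/proc/1/ns/mnt", O_RDONLY | O_CLOEXEC);
 *	if (fd >= 0 && setns(fd, CLONE_NEWNS) == 0)
 *		;	/* now in init's mount namespace */
 *	close(fd);
 */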