Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : #include "cgroup-internal.h"
3 :
4 : #include <linux/ctype.h>
5 : #include <linux/kmod.h>
6 : #include <linux/sort.h>
7 : #include <linux/delay.h>
8 : #include <linux/mm.h>
9 : #include <linux/sched/signal.h>
10 : #include <linux/sched/task.h>
11 : #include <linux/magic.h>
12 : #include <linux/slab.h>
13 : #include <linux/vmalloc.h>
14 : #include <linux/delayacct.h>
15 : #include <linux/pid_namespace.h>
16 : #include <linux/cgroupstats.h>
17 : #include <linux/fs_parser.h>
18 :
19 : #include <trace/events/cgroup.h>
20 :
21 : /*
22 : * pidlists linger the following amount before being destroyed. The goal
23 : * is to avoid frequent destruction in the middle of consecutive read calls.
24 : * Expiring in the middle is a performance problem, not a correctness one.
25 : * 1 sec should be enough.
26 : */
27 : #define CGROUP_PIDLIST_DESTROY_DELAY HZ
28 :
29 : /* Controllers blocked by the commandline in v1 */
30 : static u16 cgroup_no_v1_mask;
31 :
32 : /* disable named v1 mounts */
33 : static bool cgroup_no_v1_named;
34 :
35 : /*
36 : * pidlist destructions need to be flushed on cgroup destruction. Use a
37 : * separate workqueue as flush domain.
38 : */
39 : static struct workqueue_struct *cgroup_pidlist_destroy_wq;
40 :
41 : /* protects cgroup_subsys->release_agent_path */
42 : static DEFINE_SPINLOCK(release_agent_path_lock);
43 :
44 0 : bool cgroup1_ssid_disabled(int ssid)
45 : {
46 0 : return cgroup_no_v1_mask & (1 << ssid);
47 : }
48 :
49 : /**
50 : * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
51 : * @from: the task whose cgroup memberships @tsk will be attached to
52 : * @tsk: the task to be attached
53 : */
54 0 : int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
55 : {
56 0 : struct cgroup_root *root;
57 0 : int retval = 0;
58 :
59 0 : mutex_lock(&cgroup_mutex);
60 0 : percpu_down_write(&cgroup_threadgroup_rwsem);
61 0 : for_each_root(root) {
62 0 : struct cgroup *from_cgrp;
63 :
64 0 : if (root == &cgrp_dfl_root)
65 0 : continue;
66 :
67 0 : spin_lock_irq(&css_set_lock);
68 0 : from_cgrp = task_cgroup_from_root(from, root);
69 0 : spin_unlock_irq(&css_set_lock);
70 :
71 0 : retval = cgroup_attach_task(from_cgrp, tsk, false);
72 0 : if (retval)
73 : break;
74 : }
75 0 : percpu_up_write(&cgroup_threadgroup_rwsem);
76 0 : mutex_unlock(&cgroup_mutex);
77 :
78 0 : return retval;
79 : }
80 : EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
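/*
 * Illustrative sketch, not part of the original file: as the symbol is
 * exported, an in-kernel user spawning a worker kthread (vhost does
 * this) can mirror the spawning task's cgroup memberships onto it.
 * Hypothetical helper, assuming @worker is a live kthread:
 *
 *	static int hypothetical_adopt_cgroups(struct task_struct *worker)
 *	{
 *		return cgroup_attach_task_all(current, worker);
 *	}
 */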
81 :
82 : /**
83 : * cgroup_transfer_tasks - move tasks from one cgroup to another
84 : * @to: cgroup to which the tasks will be moved
85 : * @from: cgroup in which the tasks currently reside
86 : *
87 : * Locking rules between cgroup_post_fork() and the migration path
88 : * guarantee that, if a task is forking while being migrated, the new child
89 : * is either visible in the source cgroup after the parent's
90 : * migration is complete or put into the target cgroup. No task
91 : * can slip out of migration through forking.
92 : */
93 0 : int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
94 : {
95 0 : DEFINE_CGROUP_MGCTX(mgctx);
96 0 : struct cgrp_cset_link *link;
97 0 : struct css_task_iter it;
98 0 : struct task_struct *task;
99 0 : int ret;
100 :
101 0 : if (cgroup_on_dfl(to))
102 : return -EINVAL;
103 :
104 0 : ret = cgroup_migrate_vet_dst(to);
105 0 : if (ret)
106 : return ret;
107 :
108 0 : mutex_lock(&cgroup_mutex);
109 :
110 0 : percpu_down_write(&cgroup_threadgroup_rwsem);
111 :
112 : /* all tasks in @from are being moved, all csets are source */
113 0 : spin_lock_irq(&css_set_lock);
114 0 : list_for_each_entry(link, &from->cset_links, cset_link)
115 0 : cgroup_migrate_add_src(link->cset, to, &mgctx);
116 0 : spin_unlock_irq(&css_set_lock);
117 :
118 0 : ret = cgroup_migrate_prepare_dst(&mgctx);
119 0 : if (ret)
120 0 : goto out_err;
121 :
122 : /*
123 : * Migrate tasks one-by-one until @from is empty. This fails iff
124 : * ->can_attach() fails.
125 : */
126 0 : do {
127 0 : css_task_iter_start(&from->self, 0, &it);
128 :
129 0 : do {
130 0 : task = css_task_iter_next(&it);
131 0 : } while (task && (task->flags & PF_EXITING));
132 :
133 0 : if (task)
134 0 : get_task_struct(task);
135 0 : css_task_iter_end(&it);
136 :
137 0 : if (task) {
138 0 : ret = cgroup_migrate(task, false, &mgctx);
139 0 : if (!ret)
140 0 : TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
141 0 : put_task_struct(task);
142 : }
143 0 : } while (task && !ret);
144 0 : out_err:
145 0 : cgroup_migrate_finish(&mgctx);
146 0 : percpu_up_write(&cgroup_threadgroup_rwsem);
147 0 : mutex_unlock(&cgroup_mutex);
148 0 : return ret;
149 : }
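/*
 * Illustrative sketch: cpuset's legacy hotplug path uses this to
 * evacuate an emptied cpuset into its parent. A hypothetical helper
 * following the same pattern, assuming @cgrp is a live non-root v1
 * cgroup:
 *
 *	static void hypothetical_evacuate(struct cgroup *cgrp)
 *	{
 *		if (cgroup_transfer_tasks(cgroup_parent(cgrp), cgrp))
 *			pr_warn("cgroup: evacuation incomplete\n");
 *	}
 */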
150 :
151 : /*
152 : * Stuff for reading the 'tasks'/'procs' files.
153 : *
154 : * Reading these files can return large amounts of data if a cgroup has
155 : * *lots* of attached tasks. So they may need several calls to read(),
156 : * but we cannot guarantee that the information we produce is correct
157 : * unless we produce it entirely atomically.
158 : *
159 : */
160 :
161 : /* which pidlist file are we talking about? */
162 : enum cgroup_filetype {
163 : CGROUP_FILE_PROCS,
164 : CGROUP_FILE_TASKS,
165 : };
166 :
167 : /*
168 : * A pidlist is a list of pids that virtually represents the contents of one
169 : * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
170 : * a pair (one each for procs, tasks) for each pid namespace that's relevant
171 : * to the cgroup.
172 : */
173 : struct cgroup_pidlist {
174 : /*
175 : * used to find which pidlist is wanted. doesn't change as long as
176 : * this particular list stays in the list.
177 : */
178 : struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
179 : /* array of xids */
180 : pid_t *list;
181 : /* how many elements the above list has */
182 : int length;
183 : /* each of these stored in a list by its cgroup */
184 : struct list_head links;
185 : /* pointer to the cgroup we belong to, for list removal purposes */
186 : struct cgroup *owner;
187 : /* for delayed destruction */
188 : struct delayed_work destroy_dwork;
189 : };
190 :
191 : /*
192 : * Used to destroy all pidlists still lingering on the destroy timer. None
193 : * should be left afterwards.
194 : */
195 50 : void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
196 : {
197 50 : struct cgroup_pidlist *l, *tmp_l;
198 :
199 50 : mutex_lock(&cgrp->pidlist_mutex);
200 50 : list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
201 0 : mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
202 50 : mutex_unlock(&cgrp->pidlist_mutex);
203 :
204 50 : flush_workqueue(cgroup_pidlist_destroy_wq);
205 50 : BUG_ON(!list_empty(&cgrp->pidlists));
206 50 : }
207 :
208 0 : static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
209 : {
210 0 : struct delayed_work *dwork = to_delayed_work(work);
211 0 : struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
212 : destroy_dwork);
213 0 : struct cgroup_pidlist *tofree = NULL;
214 :
215 0 : mutex_lock(&l->owner->pidlist_mutex);
216 :
217 : /*
218 : * Destroy iff we didn't get queued again. The state won't change
219 : * as destroy_dwork can only be queued while locked.
220 : */
221 0 : if (!delayed_work_pending(dwork)) {
222 0 : list_del(&l->links);
223 0 : kvfree(l->list);
224 0 : put_pid_ns(l->key.ns);
225 0 : tofree = l;
226 : }
227 :
228 0 : mutex_unlock(&l->owner->pidlist_mutex);
229 0 : kfree(tofree);
230 0 : }
231 :
232 : /*
233 : * pidlist_uniq - given a sorted pid list, strip out all duplicate entries
234 : * Returns the number of unique elements.
235 : */
236 0 : static int pidlist_uniq(pid_t *list, int length)
237 : {
238 0 : int src, dest = 1;
239 :
240 : /*
241 : * we presume the 0th element is unique, so src starts at 1. trivial
242 : * edge cases first; no work needs to be done for either
243 : */
244 0 : if (length == 0 || length == 1)
245 : return length;
246 : /* src and dest walk down the list; dest counts unique elements */
247 0 : for (src = 1; src < length; src++) {
248 : /* find next unique element */
249 0 : while (list[src] == list[src-1]) {
250 0 : src++;
251 0 : if (src == length)
252 0 : goto after;
253 : }
254 : /* dest always points to where the next unique element goes */
255 0 : list[dest] = list[src];
256 0 : dest++;
257 : }
258 0 : after:
259 : return dest;
260 : }
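/*
 * Worked example: given the sorted input {3, 3, 5, 7, 7, 7} with
 * length 6, the walk above compacts the array in place so its first
 * three slots hold {3, 5, 7} and pidlist_uniq() returns 3; callers
 * treat only those first three entries as valid.
 */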
261 :
262 : /*
263 : * The two pid files - tasks and cgroup.procs - guaranteed that the result
264 : * is sorted, which forced this whole pidlist fiasco. As pid order is
265 : * different per namespace, each namespace needs a differently sorted list,
266 : * making it impossible to use, for example, a single rbtree of member tasks
267 : * sorted by task pointer. As pidlists can be fairly large, allocating one
268 : * per open file is dangerous, so cgroup had to implement a shared pool of
269 : * pidlists keyed by cgroup and namespace.
270 : */
271 0 : static int cmppid(const void *a, const void *b)
272 : {
273 0 : return *(pid_t *)a - *(pid_t *)b;
274 : }
275 :
276 0 : static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
277 : enum cgroup_filetype type)
278 : {
279 0 : struct cgroup_pidlist *l;
280 : /* don't need task_nsproxy() if we're looking at ourself */
281 0 : struct pid_namespace *ns = task_active_pid_ns(current);
282 :
283 0 : lockdep_assert_held(&cgrp->pidlist_mutex);
284 :
285 0 : list_for_each_entry(l, &cgrp->pidlists, links)
286 0 : if (l->key.type == type && l->key.ns == ns)
287 0 : return l;
288 : return NULL;
289 : }
290 :
291 : /*
292 : * find the appropriate pidlist for our purpose (given procs vs tasks),
293 : * creating a new one if necessary. Must be called with
294 : * cgrp->pidlist_mutex held; returns NULL if we're out of
295 : * memory.
296 : */
297 0 : static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
298 : enum cgroup_filetype type)
299 : {
300 0 : struct cgroup_pidlist *l;
301 :
302 0 : lockdep_assert_held(&cgrp->pidlist_mutex);
303 :
304 0 : l = cgroup_pidlist_find(cgrp, type);
305 0 : if (l)
306 : return l;
307 :
308 : /* entry not found; create a new one */
309 0 : l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
310 0 : if (!l)
311 : return l;
312 :
313 0 : INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
314 0 : l->key.type = type;
315 : /* don't need task_nsproxy() if we're looking at ourself */
316 0 : l->key.ns = get_pid_ns(task_active_pid_ns(current));
317 0 : l->owner = cgrp;
318 0 : list_add(&l->links, &cgrp->pidlists);
319 0 : return l;
320 : }
321 :
322 : /*
323 : * Load a cgroup's pidarray with either procs' tgids or tasks' pids
324 : */
325 0 : static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
326 : struct cgroup_pidlist **lp)
327 : {
328 0 : pid_t *array;
329 0 : int length;
330 0 : int pid, n = 0; /* used for populating the array */
331 0 : struct css_task_iter it;
332 0 : struct task_struct *tsk;
333 0 : struct cgroup_pidlist *l;
334 :
335 0 : lockdep_assert_held(&cgrp->pidlist_mutex);
336 :
337 : /*
338 : * If the cgroup gets more users after we read the count, we won't have
339 : * enough space - tough. This race is indistinguishable to the
340 : * caller from the case that the additional cgroup users didn't
341 : * show up until sometime later on.
342 : */
343 0 : length = cgroup_task_count(cgrp);
344 0 : array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
345 0 : if (!array)
346 0 : return -ENOMEM;
347 : /* now, populate the array */
348 0 : css_task_iter_start(&cgrp->self, 0, &it);
349 0 : while ((tsk = css_task_iter_next(&it))) {
350 0 : if (unlikely(n == length))
351 : break;
352 : /* get tgid or pid for procs or tasks file respectively */
353 0 : if (type == CGROUP_FILE_PROCS)
354 0 : pid = task_tgid_vnr(tsk);
355 : else
356 0 : pid = task_pid_vnr(tsk);
357 0 : if (pid > 0) /* make sure to only use valid results */
358 0 : array[n++] = pid;
359 : }
360 0 : css_task_iter_end(&it);
361 0 : length = n;
362 : /* now sort & (if procs) strip out duplicates */
363 0 : sort(array, length, sizeof(pid_t), cmppid, NULL);
364 0 : if (type == CGROUP_FILE_PROCS)
365 0 : length = pidlist_uniq(array, length);
366 :
367 0 : l = cgroup_pidlist_find_create(cgrp, type);
368 0 : if (!l) {
369 0 : kvfree(array);
370 0 : return -ENOMEM;
371 : }
372 :
373 : /* store array, freeing old if necessary */
374 0 : kvfree(l->list);
375 0 : l->list = array;
376 0 : l->length = length;
377 0 : *lp = l;
378 0 : return 0;
379 : }
380 :
381 : /*
382 : * seq_file methods for the tasks/procs files. The seq_file position is the
383 : * next pid to display; the seq_file iterator is a pointer to the pid
384 : * in the cgroup->l->list array.
385 : */
386 :
387 0 : static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
388 : {
389 : /*
390 : * Initially we receive a position value that corresponds to
391 : * one more than the last pid shown (or 0 on the first call or
392 : * after a seek to the start). Use a binary search to find the
393 : * next pid to display, if any.
394 : */
395 0 : struct kernfs_open_file *of = s->private;
396 0 : struct cgroup *cgrp = seq_css(s)->cgroup;
397 0 : struct cgroup_pidlist *l;
398 0 : enum cgroup_filetype type = seq_cft(s)->private;
399 0 : int index = 0, pid = *pos;
400 0 : int *iter, ret;
401 :
402 0 : mutex_lock(&cgrp->pidlist_mutex);
403 :
404 : /*
405 : * !NULL @of->priv indicates that this isn't the first start()
406 : * after open. If the matching pidlist is around, we can use that.
407 : * Look for it. Note that @of->priv can't be used directly. It
408 : * could already have been destroyed.
409 : */
410 0 : if (of->priv)
411 0 : of->priv = cgroup_pidlist_find(cgrp, type);
412 :
413 : /*
414 : * Either this is the first start() after open or the matching
415 : * pidlist has been destroyed in between. Create a new one.
416 : */
417 0 : if (!of->priv) {
418 0 : ret = pidlist_array_load(cgrp, type,
419 0 : (struct cgroup_pidlist **)&of->priv);
420 0 : if (ret)
421 0 : return ERR_PTR(ret);
422 : }
423 0 : l = of->priv;
424 :
425 0 : if (pid) {
426 0 : int end = l->length;
427 :
428 0 : while (index < end) {
429 0 : int mid = (index + end) / 2;
430 0 : if (l->list[mid] == pid) {
431 : index = mid;
432 : break;
433 0 : } else if (l->list[mid] <= pid)
434 0 : index = mid + 1;
435 : else
436 : end = mid;
437 : }
438 : }
439 : /* If we're off the end of the array, we're done */
440 0 : if (index >= l->length)
441 : return NULL;
442 : /* Update the abstract position to be the actual pid that we found */
443 0 : iter = l->list + index;
444 0 : *pos = *iter;
445 0 : return iter;
446 : }
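/*
 * Worked example of the search above: with l->list = {2, 5, 9} and
 * *pos == 6 (say the pid shown last no longer exists), the probes
 * drive index to 2, so iteration resumes at pid 9 and *pos is
 * rewritten to 9.
 */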
447 :
448 0 : static void cgroup_pidlist_stop(struct seq_file *s, void *v)
449 : {
450 0 : struct kernfs_open_file *of = s->private;
451 0 : struct cgroup_pidlist *l = of->priv;
452 :
453 0 : if (l)
454 0 : mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
455 : CGROUP_PIDLIST_DESTROY_DELAY);
456 0 : mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
457 0 : }
458 :
459 0 : static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
460 : {
461 0 : struct kernfs_open_file *of = s->private;
462 0 : struct cgroup_pidlist *l = of->priv;
463 0 : pid_t *p = v;
464 0 : pid_t *end = l->list + l->length;
465 : /*
466 : * Advance to the next pid in the array. If this goes off the
467 : * end, we're done.
468 : */
469 0 : p++;
470 0 : if (p >= end) {
471 0 : (*pos)++;
472 0 : return NULL;
473 : } else {
474 0 : *pos = *p;
475 0 : return p;
476 : }
477 : }
478 :
479 0 : static int cgroup_pidlist_show(struct seq_file *s, void *v)
480 : {
481 0 : seq_printf(s, "%d\n", *(int *)v);
482 :
483 0 : return 0;
484 : }
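/*
 * Illustrative sketch: the seq_file methods above back a plain
 * sequential read from userspace. Assuming a hypothetical v1 mount at
 * /sys/fs/cgroup/cpu with a child group "grp":
 *
 *	FILE *f = fopen("/sys/fs/cgroup/cpu/grp/tasks", "r");
 *	int pid;
 *
 *	while (f && fscanf(f, "%d", &pid) == 1)
 *		printf("%d\n", pid);
 *	if (f)
 *		fclose(f);
 *
 * The pids arrive sorted within the reader's pid namespace, courtesy
 * of the pidlist machinery.
 */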
485 :
486 99 : static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
487 : char *buf, size_t nbytes, loff_t off,
488 : bool threadgroup)
489 : {
490 99 : struct cgroup *cgrp;
491 99 : struct task_struct *task;
492 99 : const struct cred *cred, *tcred;
493 99 : ssize_t ret;
494 99 : bool locked;
495 :
496 99 : cgrp = cgroup_kn_lock_live(of->kn, false);
497 99 : if (!cgrp)
498 : return -ENODEV;
499 :
500 99 : task = cgroup_procs_write_start(buf, threadgroup, &locked);
501 99 : ret = PTR_ERR_OR_ZERO(task);
502 99 : if (ret)
503 0 : goto out_unlock;
504 :
505 : /*
506 : * Even if we're attaching all tasks in the thread group, we only
507 : * need to check permissions on one of them.
508 : */
509 99 : cred = current_cred();
510 99 : tcred = get_task_cred(task);
511 99 : if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
512 0 : !uid_eq(cred->euid, tcred->uid) &&
513 0 : !uid_eq(cred->euid, tcred->suid))
514 0 : ret = -EACCES;
515 99 : put_cred(tcred);
516 99 : if (ret)
517 0 : goto out_finish;
518 :
519 99 : ret = cgroup_attach_task(cgrp, task, threadgroup);
520 :
521 99 : out_finish:
522 99 : cgroup_procs_write_finish(task, locked);
523 99 : out_unlock:
524 99 : cgroup_kn_unlock(of->kn);
525 :
526 99 : return ret ?: nbytes;
527 : }
528 :
529 99 : static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
530 : char *buf, size_t nbytes, loff_t off)
531 : {
532 99 : return __cgroup1_procs_write(of, buf, nbytes, off, true);
533 : }
534 :
535 0 : static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
536 : char *buf, size_t nbytes, loff_t off)
537 : {
538 0 : return __cgroup1_procs_write(of, buf, nbytes, off, false);
539 : }
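/*
 * Illustrative sketch: the handlers above are the kernel side of the
 * usual "echo $$ > cgroup.procs" idiom. Equivalent C, with a
 * hypothetical mount point:
 *
 *	int fd = open("/sys/fs/cgroup/cpu/grp/cgroup.procs", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		dprintf(fd, "%d", getpid());
 *		close(fd);
 *	}
 *
 * Writing to "cgroup.procs" moves the whole thread group; writing the
 * same pid to "tasks" would move only that single thread.
 */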
540 :
541 0 : static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
542 : char *buf, size_t nbytes, loff_t off)
543 : {
544 0 : struct cgroup *cgrp;
545 :
546 0 : BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
547 :
548 0 : cgrp = cgroup_kn_lock_live(of->kn, false);
549 0 : if (!cgrp)
550 : return -ENODEV;
551 0 : spin_lock(&release_agent_path_lock);
552 0 : strlcpy(cgrp->root->release_agent_path, strstrip(buf),
553 : sizeof(cgrp->root->release_agent_path));
554 0 : spin_unlock(&release_agent_path_lock);
555 0 : cgroup_kn_unlock(of->kn);
556 0 : return nbytes;
557 : }
558 :
559 0 : static int cgroup_release_agent_show(struct seq_file *seq, void *v)
560 : {
561 0 : struct cgroup *cgrp = seq_css(seq)->cgroup;
562 :
563 0 : spin_lock(&release_agent_path_lock);
564 0 : seq_puts(seq, cgrp->root->release_agent_path);
565 0 : spin_unlock(&release_agent_path_lock);
566 0 : seq_putc(seq, '\n');
567 0 : return 0;
568 : }
569 :
570 0 : static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
571 : {
572 0 : seq_puts(seq, "0\n");
573 0 : return 0;
574 : }
575 :
576 0 : static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
577 : struct cftype *cft)
578 : {
579 0 : return notify_on_release(css->cgroup);
580 : }
581 :
582 0 : static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
583 : struct cftype *cft, u64 val)
584 : {
585 0 : if (val)
586 0 : set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
587 : else
588 0 : clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
589 0 : return 0;
590 : }
591 :
592 0 : static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
593 : struct cftype *cft)
594 : {
595 0 : return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
596 : }
597 :
598 0 : static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
599 : struct cftype *cft, u64 val)
600 : {
601 0 : if (val)
602 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
603 : else
604 0 : clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
605 0 : return 0;
606 : }
607 :
608 : /* cgroup core interface files for the legacy hierarchies */
609 : struct cftype cgroup1_base_files[] = {
610 : {
611 : .name = "cgroup.procs",
612 : .seq_start = cgroup_pidlist_start,
613 : .seq_next = cgroup_pidlist_next,
614 : .seq_stop = cgroup_pidlist_stop,
615 : .seq_show = cgroup_pidlist_show,
616 : .private = CGROUP_FILE_PROCS,
617 : .write = cgroup1_procs_write,
618 : },
619 : {
620 : .name = "cgroup.clone_children",
621 : .read_u64 = cgroup_clone_children_read,
622 : .write_u64 = cgroup_clone_children_write,
623 : },
624 : {
625 : .name = "cgroup.sane_behavior",
626 : .flags = CFTYPE_ONLY_ON_ROOT,
627 : .seq_show = cgroup_sane_behavior_show,
628 : },
629 : {
630 : .name = "tasks",
631 : .seq_start = cgroup_pidlist_start,
632 : .seq_next = cgroup_pidlist_next,
633 : .seq_stop = cgroup_pidlist_stop,
634 : .seq_show = cgroup_pidlist_show,
635 : .private = CGROUP_FILE_TASKS,
636 : .write = cgroup1_tasks_write,
637 : },
638 : {
639 : .name = "notify_on_release",
640 : .read_u64 = cgroup_read_notify_on_release,
641 : .write_u64 = cgroup_write_notify_on_release,
642 : },
643 : {
644 : .name = "release_agent",
645 : .flags = CFTYPE_ONLY_ON_ROOT,
646 : .seq_show = cgroup_release_agent_show,
647 : .write = cgroup_release_agent_write,
648 : .max_write_len = PATH_MAX - 1,
649 : },
650 : { } /* terminate */
651 : };
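/*
 * Note: every directory of a v1 hierarchy carries cgroup.procs, tasks,
 * cgroup.clone_children and notify_on_release from this table;
 * cgroup.sane_behavior and release_agent are restricted to the root by
 * CFTYPE_ONLY_ON_ROOT.
 */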
652 :
653 : /* Display information about each subsystem and each hierarchy */
654 1 : int proc_cgroupstats_show(struct seq_file *m, void *v)
655 : {
656 1 : struct cgroup_subsys *ss;
657 1 : int i;
658 :
659 1 : seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
660 : /*
661 : * ideally we don't want subsystems moving around while we do this.
662 : * cgroup_mutex is also necessary to guarantee an atomic snapshot of
663 : * subsys/hierarchy state.
664 : */
665 1 : mutex_lock(&cgroup_mutex);
666 :
667 1 : for_each_subsys(ss, i)
668 : seq_printf(m, "%s\t%d\t%d\t%d\n",
669 : ss->legacy_name, ss->root->hierarchy_id,
670 : atomic_read(&ss->root->nr_cgrps),
671 : cgroup_ssid_enabled(i));
672 :
673 1 : mutex_unlock(&cgroup_mutex);
674 1 : return 0;
675 : }
676 :
677 : /**
678 : * cgroupstats_build - build and fill cgroupstats
679 : * @stats: cgroupstats to fill information into
680 : * @dentry: A dentry belonging to the cgroup for which stats have
681 : * been requested.
682 : *
683 : * Build and fill cgroupstats so that taskstats can export it to user
684 : * space.
685 : */
686 0 : int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
687 : {
688 0 : struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
689 0 : struct cgroup *cgrp;
690 0 : struct css_task_iter it;
691 0 : struct task_struct *tsk;
692 :
693 : /* it should be a kernfs_node belonging to cgroupfs and be a directory */
694 0 : if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
695 0 : kernfs_type(kn) != KERNFS_DIR)
696 : return -EINVAL;
697 :
698 0 : mutex_lock(&cgroup_mutex);
699 :
700 : /*
701 : * We aren't being called from kernfs and there's no guarantee on
702 : * @kn->priv's validity. For this and css_tryget_online_from_dir(),
703 : * @kn->priv is RCU safe. Let's do the RCU dancing.
704 : */
705 0 : rcu_read_lock();
706 0 : cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
707 0 : if (!cgrp || cgroup_is_dead(cgrp)) {
708 0 : rcu_read_unlock();
709 0 : mutex_unlock(&cgroup_mutex);
710 0 : return -ENOENT;
711 : }
712 0 : rcu_read_unlock();
713 :
714 0 : css_task_iter_start(&cgrp->self, 0, &it);
715 0 : while ((tsk = css_task_iter_next(&it))) {
716 0 : switch (tsk->state) {
717 0 : case TASK_RUNNING:
718 0 : stats->nr_running++;
719 0 : break;
720 0 : case TASK_INTERRUPTIBLE:
721 0 : stats->nr_sleeping++;
722 0 : break;
723 0 : case TASK_UNINTERRUPTIBLE:
724 0 : stats->nr_uninterruptible++;
725 0 : break;
726 0 : case TASK_STOPPED:
727 0 : stats->nr_stopped++;
728 0 : break;
729 : default:
730 0 : if (delayacct_is_task_waiting_on_io(tsk))
731 0 : stats->nr_io_wait++;
732 : break;
733 : }
734 : }
735 0 : css_task_iter_end(&it);
736 :
737 0 : mutex_unlock(&cgroup_mutex);
738 0 : return 0;
739 : }
740 :
741 240 : void cgroup1_check_for_release(struct cgroup *cgrp)
742 : {
743 240 : if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
744 0 : !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
745 0 : schedule_work(&cgrp->release_agent_work);
746 240 : }
747 :
748 : /*
749 : * Notify userspace when a cgroup is released, by running the
750 : * configured release agent with the name of the cgroup (path
751 : * relative to the root of cgroup file system) as the argument.
752 : *
753 : * Most likely, this user command will try to rmdir this cgroup.
754 : *
755 : * This races with the possibility that some other task will be
756 : * attached to this cgroup before it is removed, or that some other
757 : * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
758 : * The presumed 'rmdir' will fail quietly if this cgroup is no longer
759 : * unused, and this cgroup will be reprieved from its death sentence,
760 : * to continue to serve a useful existence. Next time it's released,
761 : * we will get notified again, if it still has 'notify_on_release' set.
762 : *
763 : * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
764 : * means only wait until the task is successfully execve()'d. The
765 : * separate release agent task is forked by call_usermodehelper(),
766 : * then control in this thread returns here, without waiting for the
767 : * release agent task. We don't bother to wait because the caller of
768 : * this routine has no use for the exit status of the release agent
769 : * task, so no sense holding our caller up for that.
770 : */
771 0 : void cgroup1_release_agent(struct work_struct *work)
772 : {
773 0 : struct cgroup *cgrp =
774 0 : container_of(work, struct cgroup, release_agent_work);
775 0 : char *pathbuf, *agentbuf;
776 0 : char *argv[3], *envp[3];
777 0 : int ret;
778 :
779 : /* snoop agent path and exit early if empty */
780 0 : if (!cgrp->root->release_agent_path[0])
781 0 : return;
782 :
783 : /* prepare argument buffers */
784 0 : pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
785 0 : agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
786 0 : if (!pathbuf || !agentbuf)
787 0 : goto out_free;
788 :
789 0 : spin_lock(&release_agent_path_lock);
790 0 : strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
791 0 : spin_unlock(&release_agent_path_lock);
792 0 : if (!agentbuf[0])
793 0 : goto out_free;
794 :
795 0 : ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
796 0 : if (ret < 0 || ret >= PATH_MAX)
797 0 : goto out_free;
798 :
799 0 : argv[0] = agentbuf;
800 0 : argv[1] = pathbuf;
801 0 : argv[2] = NULL;
802 :
803 : /* minimal command environment */
804 0 : envp[0] = "HOME=/";
805 0 : envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
806 0 : envp[2] = NULL;
807 :
808 0 : call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
809 0 : out_free:
810 0 : kfree(agentbuf);
811 0 : kfree(pathbuf);
812 : }
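/*
 * Illustrative sketch: the agent is invoked as "agent <path>", where
 * <path> is the released cgroup's path relative to the hierarchy root
 * (see cgroup_path_ns() above). Installing an agent from userspace,
 * with hypothetical paths:
 *
 *	int fd = open("/sys/fs/cgroup/cpu/release_agent", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		dprintf(fd, "%s", "/usr/local/sbin/my-agent");
 *		close(fd);
 *	}
 */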
813 :
814 : /*
815 : * cgroup1_rename - Only allow simple rename of directories in place.
816 : */
817 0 : static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
818 : const char *new_name_str)
819 : {
820 0 : struct cgroup *cgrp = kn->priv;
821 0 : int ret;
822 :
823 0 : if (kernfs_type(kn) != KERNFS_DIR)
824 : return -ENOTDIR;
825 0 : if (kn->parent != new_parent)
826 : return -EIO;
827 :
828 : /*
829 : * We're going to grab cgroup_mutex, which nests outside kernfs
830 : * active_ref. kernfs_rename() doesn't require active_ref
831 : * protection. Break them before grabbing cgroup_mutex.
832 : */
833 0 : kernfs_break_active_protection(new_parent);
834 0 : kernfs_break_active_protection(kn);
835 :
836 0 : mutex_lock(&cgroup_mutex);
837 :
838 0 : ret = kernfs_rename(kn, new_parent, new_name_str);
839 0 : if (!ret)
840 0 : TRACE_CGROUP_PATH(rename, cgrp);
841 :
842 0 : mutex_unlock(&cgroup_mutex);
843 :
844 0 : kernfs_unbreak_active_protection(kn);
845 0 : kernfs_unbreak_active_protection(new_parent);
846 0 : return ret;
847 : }
848 :
849 288 : static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
850 : {
851 288 : struct cgroup_root *root = cgroup_root_from_kf(kf_root);
852 288 : struct cgroup_subsys *ss;
853 288 : int ssid;
854 :
855 288 : for_each_subsys(ss, ssid)
856 : if (root->subsys_mask & (1 << ssid))
857 : seq_show_option(seq, ss->legacy_name, NULL);
858 288 : if (root->flags & CGRP_ROOT_NOPREFIX)
859 0 : seq_puts(seq, ",noprefix");
860 288 : if (root->flags & CGRP_ROOT_XATTR)
861 288 : seq_puts(seq, ",xattr");
862 289 : if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
863 0 : seq_puts(seq, ",cpuset_v2_mode");
864 :
865 289 : spin_lock(&release_agent_path_lock);
866 289 : if (strlen(root->release_agent_path))
867 0 : seq_show_option(seq, "release_agent",
868 0 : root->release_agent_path);
869 289 : spin_unlock(&release_agent_path_lock);
870 :
871 289 : if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
872 0 : seq_puts(seq, ",clone_children");
873 289 : if (strlen(root->name))
874 289 : seq_show_option(seq, "name", root->name);
875 289 : return 0;
876 : }
877 :
878 : enum cgroup1_param {
879 : Opt_all,
880 : Opt_clone_children,
881 : Opt_cpuset_v2_mode,
882 : Opt_name,
883 : Opt_none,
884 : Opt_noprefix,
885 : Opt_release_agent,
886 : Opt_xattr,
887 : };
888 :
889 : const struct fs_parameter_spec cgroup1_fs_parameters[] = {
890 : fsparam_flag ("all", Opt_all),
891 : fsparam_flag ("clone_children", Opt_clone_children),
892 : fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
893 : fsparam_string("name", Opt_name),
894 : fsparam_flag ("none", Opt_none),
895 : fsparam_flag ("noprefix", Opt_noprefix),
896 : fsparam_string("release_agent", Opt_release_agent),
897 : fsparam_flag ("xattr", Opt_xattr),
898 : {}
899 : };
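/*
 * Illustrative sketch: the table above corresponds to option strings
 * passed at mount time, e.g. via mount(2) with hypothetical mount
 * points:
 *
 *	mount("cgroup", "/dev/cpuset", "cgroup", 0, "cpuset,noprefix");
 *	mount("none", "/sys/fs/cgroup/systemd", "cgroup", 0,
 *	      "none,name=systemd");
 */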
900 :
901 4 : int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
902 : {
903 4 : struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
904 4 : struct cgroup_subsys *ss;
905 4 : struct fs_parse_result result;
906 4 : int opt, i;
907 :
908 4 : opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
909 4 : if (opt == -ENOPARAM) {
910 1 : if (strcmp(param->key, "source") == 0) {
911 1 : if (fc->source)
912 0 : return invalf(fc, "Multiple sources not supported");
913 1 : fc->source = param->string;
914 1 : param->string = NULL;
915 1 : return 0;
916 : }
917 0 : for_each_subsys(ss, i) {
918 : if (strcmp(param->key, ss->legacy_name))
919 : continue;
920 : if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
921 : return invalfc(fc, "Disabled controller '%s'",
922 : param->key);
923 : ctx->subsys_mask |= (1 << i);
924 : return 0;
925 : }
926 0 : return invalfc(fc, "Unknown subsys name '%s'", param->key);
927 : }
928 3 : if (opt < 0)
929 : return opt;
930 :
931 3 : switch (opt) {
932 1 : case Opt_none:
933 : /* Explicitly have no subsystems */
934 1 : ctx->none = true;
935 1 : break;
936 0 : case Opt_all:
937 0 : ctx->all_ss = true;
938 0 : break;
939 0 : case Opt_noprefix:
940 0 : ctx->flags |= CGRP_ROOT_NOPREFIX;
941 0 : break;
942 0 : case Opt_clone_children:
943 0 : ctx->cpuset_clone_children = true;
944 0 : break;
945 0 : case Opt_cpuset_v2_mode:
946 0 : ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
947 0 : break;
948 1 : case Opt_xattr:
949 1 : ctx->flags |= CGRP_ROOT_XATTR;
950 1 : break;
951 0 : case Opt_release_agent:
952 : /* Specifying two release agents is forbidden */
953 0 : if (ctx->release_agent)
954 0 : return invalfc(fc, "release_agent respecified");
955 0 : ctx->release_agent = param->string;
956 0 : param->string = NULL;
957 0 : break;
958 1 : case Opt_name:
959 : /* blocked by boot param? */
960 1 : if (cgroup_no_v1_named)
961 : return -ENOENT;
962 : /* Can't specify an empty name */
963 1 : if (!param->size)
964 0 : return invalfc(fc, "Empty name");
965 1 : if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
966 0 : return invalfc(fc, "Name too long");
967 : /* Must match [\w.-]+ */
968 8 : for (i = 0; i < param->size; i++) {
969 7 : char c = param->string[i];
970 7 : if (isalnum(c))
971 7 : continue;
972 0 : if ((c == '.') || (c == '-') || (c == '_'))
973 0 : continue;
974 0 : return invalfc(fc, "Invalid name");
975 : }
976 : /* Specifying two names is forbidden */
977 1 : if (ctx->name)
978 0 : return invalfc(fc, "name respecified");
979 1 : ctx->name = param->string;
980 1 : param->string = NULL;
981 1 : break;
982 : }
983 : return 0;
984 : }
985 :
986 1 : static int check_cgroupfs_options(struct fs_context *fc)
987 : {
988 1 : struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
989 1 : u16 mask = U16_MAX;
990 1 : u16 enabled = 0;
991 1 : struct cgroup_subsys *ss;
992 1 : int i;
993 :
994 : #ifdef CONFIG_CPUSETS
995 : mask = ~((u16)1 << cpuset_cgrp_id);
996 : #endif
997 1 : for_each_subsys(ss, i)
998 : if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
999 : enabled |= 1 << i;
1000 :
1001 1 : ctx->subsys_mask &= enabled;
1002 :
1003 : /*
1004 : * In the absence of 'none', 'name=' or subsystem name options,
1005 : * let's default to 'all'.
1006 : */
1007 1 : if (!ctx->subsys_mask && !ctx->none && !ctx->name)
1008 0 : ctx->all_ss = true;
1009 :
1010 1 : if (ctx->all_ss) {
1011 : /* Mutually exclusive option 'all' + subsystem name */
1012 : if (ctx->subsys_mask)
1013 : return invalfc(fc, "subsys name conflicts with all");
1014 : /* 'all' => select all the subsystems */
1015 : ctx->subsys_mask = enabled;
1016 : }
1017 :
1018 : /*
1019 : * We either have to specify by name or by subsystems. (So all
1020 : * empty hierarchies must have a name.)
1021 : */
1022 1 : if (!ctx->subsys_mask && !ctx->name)
1023 0 : return invalfc(fc, "Need name or subsystem set");
1024 :
1025 : /*
1026 : * Option noprefix was introduced just for backward compatibility
1027 : * with the old cpuset, so we allow noprefix only if mounting just
1028 : * the cpuset subsystem.
1029 : */
1030 1 : if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
1031 0 : return invalfc(fc, "noprefix used incorrectly");
1032 :
1033 : /* Can't specify "none" and some subsystems */
1034 1 : if (ctx->subsys_mask && ctx->none)
1035 0 : return invalfc(fc, "none used incorrectly");
1036 :
1037 : return 0;
1038 : }
1039 :
1040 0 : int cgroup1_reconfigure(struct fs_context *fc)
1041 : {
1042 0 : struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1043 0 : struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
1044 0 : struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1045 0 : int ret = 0;
1046 0 : u16 added_mask, removed_mask;
1047 :
1048 0 : cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1049 :
1050 : /* See what subsystems are wanted */
1051 0 : ret = check_cgroupfs_options(fc);
1052 0 : if (ret)
1053 0 : goto out_unlock;
1054 :
1055 0 : if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
1056 0 : pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1057 : task_tgid_nr(current), current->comm);
1058 :
1059 0 : added_mask = ctx->subsys_mask & ~root->subsys_mask;
1060 0 : removed_mask = root->subsys_mask & ~ctx->subsys_mask;
1061 :
1062 : /* Don't allow flags or name to change at remount */
1063 0 : if ((ctx->flags ^ root->flags) ||
1064 0 : (ctx->name && strcmp(ctx->name, root->name))) {
1065 0 : errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
1066 : ctx->flags, ctx->name ?: "", root->flags, root->name);
1067 0 : ret = -EINVAL;
1068 0 : goto out_unlock;
1069 : }
1070 :
1071 : /* remounting is not allowed for populated hierarchies */
1072 0 : if (!list_empty(&root->cgrp.self.children)) {
1073 0 : ret = -EBUSY;
1074 0 : goto out_unlock;
1075 : }
1076 :
1077 0 : ret = rebind_subsystems(root, added_mask);
1078 0 : if (ret)
1079 0 : goto out_unlock;
1080 :
1081 0 : WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1082 :
1083 0 : if (ctx->release_agent) {
1084 0 : spin_lock(&release_agent_path_lock);
1085 0 : strcpy(root->release_agent_path, ctx->release_agent);
1086 0 : spin_unlock(&release_agent_path_lock);
1087 : }
1088 :
1089 0 : trace_cgroup_remount(root);
1090 :
1091 0 : out_unlock:
1092 0 : mutex_unlock(&cgroup_mutex);
1093 0 : return ret;
1094 : }
1095 :
1096 : struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
1097 : .rename = cgroup1_rename,
1098 : .show_options = cgroup1_show_options,
1099 : .mkdir = cgroup_mkdir,
1100 : .rmdir = cgroup_rmdir,
1101 : .show_path = cgroup_show_path,
1102 : };
1103 :
1104 : /*
1105 : * The guts of cgroup1 mount - find or create cgroup_root to use.
1106 : * Called with cgroup_mutex held; returns 0 on success, -E... on
1107 : * error, and a positive value when the candidate root is busy dying.
1108 : * On success it stashes a reference to cgroup_root into the given
1109 : * cgroup_fs_context; that reference does *NOT* count towards the
1110 : * cgroup_root refcount.
1111 : */
1112 1 : static int cgroup1_root_to_use(struct fs_context *fc)
1113 : {
1114 1 : struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1115 1 : struct cgroup_root *root;
1116 1 : struct cgroup_subsys *ss;
1117 1 : int i, ret;
1118 :
1119 : /* First find the desired set of subsystems */
1120 1 : ret = check_cgroupfs_options(fc);
1121 1 : if (ret)
1122 : return ret;
1123 :
1124 : /*
1125 : * Destruction of cgroup root is asynchronous, so subsystems may
1126 : * still be dying after the previous unmount. Let's drain the
1127 : * dying subsystems. We just need to ensure that the ones
1128 : * unmounted previously finish dying and don't care about new ones
1129 : * starting. Testing ref liveness is good enough.
1130 : */
1131 1 : for_each_subsys(ss, i) {
1132 : if (!(ctx->subsys_mask & (1 << i)) ||
1133 : ss->root == &cgrp_dfl_root)
1134 : continue;
1135 :
1136 : if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
1137 : return 1; /* restart */
1138 : cgroup_put(&ss->root->cgrp);
1139 : }
1140 :
1141 2 : for_each_root(root) {
1142 1 : bool name_match = false;
1143 :
1144 1 : if (root == &cgrp_dfl_root)
1145 1 : continue;
1146 :
1147 : /*
1148 : * If we asked for a name then it must match. Also, if
1149 : * the name matches but subsys_mask doesn't, we should fail.
1150 : * Remember whether name matched.
1151 : */
1152 0 : if (ctx->name) {
1153 0 : if (strcmp(ctx->name, root->name))
1154 0 : continue;
1155 : name_match = true;
1156 : }
1157 :
1158 : /*
1159 : * If we asked for subsystems (or explicitly for no
1160 : * subsystems) then they must match.
1161 : */
1162 0 : if ((ctx->subsys_mask || ctx->none) &&
1163 0 : (ctx->subsys_mask != root->subsys_mask)) {
1164 0 : if (!name_match)
1165 0 : continue;
1166 : return -EBUSY;
1167 : }
1168 :
1169 0 : if (root->flags ^ ctx->flags)
1170 0 : pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1171 :
1172 0 : ctx->root = root;
1173 0 : return 0;
1174 : }
1175 :
1176 : /*
1177 : * No such thing, create a new one. name= matching without subsys
1178 : * specification is allowed for already existing hierarchies but we
1179 : * can't create a new one without subsys specification.
1180 : */
1181 1 : if (!ctx->subsys_mask && !ctx->none)
1182 0 : return invalfc(fc, "No subsys list or none specified");
1183 :
1184 : /* Hierarchies may only be created in the initial cgroup namespace. */
1185 1 : if (ctx->ns != &init_cgroup_ns)
1186 : return -EPERM;
1187 :
1188 1 : root = kzalloc(sizeof(*root), GFP_KERNEL);
1189 1 : if (!root)
1190 : return -ENOMEM;
1191 :
1192 1 : ctx->root = root;
1193 1 : init_cgroup_root(ctx);
1194 :
1195 1 : ret = cgroup_setup_root(root, ctx->subsys_mask);
1196 1 : if (ret)
1197 0 : cgroup_free_root(root);
1198 : return ret;
1199 : }
1200 :
1201 1 : int cgroup1_get_tree(struct fs_context *fc)
1202 : {
1203 1 : struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1204 1 : int ret;
1205 :
1206 : /* Check if the caller has permission to mount. */
1207 1 : if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
1208 : return -EPERM;
1209 :
1210 1 : cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1211 :
1212 1 : ret = cgroup1_root_to_use(fc);
1213 1 : if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
1214 0 : ret = 1; /* restart */
1215 :
1216 1 : mutex_unlock(&cgroup_mutex);
1217 :
1218 1 : if (!ret)
1219 1 : ret = cgroup_do_get_tree(fc);
1220 :
1221 1 : if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
1222 0 : struct super_block *sb = fc->root->d_sb;
1223 0 : dput(fc->root);
1224 0 : deactivate_locked_super(sb);
1225 0 : ret = 1;
1226 : }
1227 :
1228 1 : if (unlikely(ret > 0)) {
1229 0 : msleep(10);
1230 0 : return restart_syscall();
1231 : }
1232 : return ret;
1233 : }
1234 :
1235 1 : static int __init cgroup1_wq_init(void)
1236 : {
1237 : /*
1238 : * Used to destroy pidlists; kept separate so it can serve as the
1239 : * flush domain. Cap @max_active at 1 too.
1240 : */
1241 1 : cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
1242 : 0, 1);
1243 1 : BUG_ON(!cgroup_pidlist_destroy_wq);
1244 1 : return 0;
1245 : }
1246 : core_initcall(cgroup1_wq_init);
1247 :
1248 0 : static int __init cgroup_no_v1(char *str)
1249 : {
1250 0 : struct cgroup_subsys *ss;
1251 0 : char *token;
1252 0 : int i;
1253 :
1254 0 : while ((token = strsep(&str, ",")) != NULL) {
1255 0 : if (!*token)
1256 0 : continue;
1257 :
1258 0 : if (!strcmp(token, "all")) {
1259 0 : cgroup_no_v1_mask = U16_MAX;
1260 0 : continue;
1261 : }
1262 :
1263 0 : if (!strcmp(token, "named")) {
1264 0 : cgroup_no_v1_named = true;
1265 0 : continue;
1266 : }
1267 :
1268 : for_each_subsys(ss, i) {
1269 : if (strcmp(token, ss->name) &&
1270 : strcmp(token, ss->legacy_name))
1271 : continue;
1272 :
1273 : cgroup_no_v1_mask |= 1 << i;
1274 : }
1275 : }
1276 0 : return 1;
1277 : }
1278 : __setup("cgroup_no_v1=", cgroup_no_v1);
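/*
 * Illustrative examples of the boot parameter parsed above:
 *
 *	cgroup_no_v1=memory,cpu		block those controllers in v1
 *	cgroup_no_v1=all		block every controller in v1
 *	cgroup_no_v1=named		disable named v1 hierarchies
 */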