LCOV - code coverage report
Current view: top level - mm - backing-dev.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 129 312 41.3 %
Date: 2021-04-22 12:43:58 Functions: 14 33 42.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : 
       3             : #include <linux/wait.h>
       4             : #include <linux/rbtree.h>
       5             : #include <linux/backing-dev.h>
       6             : #include <linux/kthread.h>
       7             : #include <linux/freezer.h>
       8             : #include <linux/fs.h>
       9             : #include <linux/pagemap.h>
      10             : #include <linux/mm.h>
      11             : #include <linux/sched/mm.h>
      12             : #include <linux/sched.h>
      13             : #include <linux/module.h>
      14             : #include <linux/writeback.h>
      15             : #include <linux/device.h>
      16             : #include <trace/events/writeback.h>
      17             : 
      18             : struct backing_dev_info noop_backing_dev_info;
      19             : EXPORT_SYMBOL_GPL(noop_backing_dev_info);
                       /* noop_backing_dev_info is set up by default_bdi_init() through bdi_init(). */
      20             : 
                       /* Device class "bdi"; created in bdi_class_init(). */
      21             : static struct class *bdi_class;
      22             : static const char *bdi_unknown_name = "(unknown)";
      23             : 
      24             : /*
      25             :  * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
      26             :  * reader side locking.
      27             :  */
      28             : DEFINE_SPINLOCK(bdi_lock);
      29             : static u64 bdi_id_cursor;
      30             : static struct rb_root bdi_tree = RB_ROOT;
      31             : LIST_HEAD(bdi_list);
      32             : 
      33             : /* bdi_wq serves all asynchronous writeback tasks */
      34             : struct workqueue_struct *bdi_wq;
      35             : 
                       /*
                        * Convert a page count to kB: shifting left by (PAGE_SHIFT - 10)
                        * multiplies by the page size divided by 1024.
                        */
      36             : #define K(x) ((x) << (PAGE_SHIFT - 10))
      37             : 
      38             : #ifdef CONFIG_DEBUG_FS
      39             : #include <linux/debugfs.h>
      40             : #include <linux/seq_file.h>
      41             : 
      42             : static struct dentry *bdi_debug_root;
      43             : 
      44           1 : static void bdi_debug_init(void)
      45             : {
                               /*
                                * Create the "bdi" debugfs directory (NULL parent) and remember
                                * it in bdi_debug_root; bdi_debug_register() creates each bdi's
                                * own directory underneath it.
                                */
      46           1 :         bdi_debug_root = debugfs_create_dir("bdi", NULL);
      47           1 : }
      48             : 
      49           0 : static int bdi_debug_stats_show(struct seq_file *m, void *v)
      50             : {
                               /*
                                * seq_file show callback for the per-bdi debugfs "stats" file.
                                * Reports on the bdi's embedded wb (&bdi->wb) and always returns 0.
                                * m->private is presumably the bdi handed to debugfs_create_file()
                                * in bdi_debug_register() -- confirm against DEFINE_SHOW_ATTRIBUTE.
                                * @v is unused.
                                */
      51           0 :         struct backing_dev_info *bdi = m->private;
      52           0 :         struct bdi_writeback *wb = &bdi->wb;
      53           0 :         unsigned long background_thresh;
      54           0 :         unsigned long dirty_thresh;
      55           0 :         unsigned long wb_thresh;
      56           0 :         unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
      57           0 :         struct inode *inode;
      58             : 
      59           0 :         nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
                               /* Count the inodes on each of the wb's lists while holding list_lock. */
      60           0 :         spin_lock(&wb->list_lock);
      61           0 :         list_for_each_entry(inode, &wb->b_dirty, i_io_list)
      62           0 :                 nr_dirty++;
      63           0 :         list_for_each_entry(inode, &wb->b_io, i_io_list)
      64           0 :                 nr_io++;
      65           0 :         list_for_each_entry(inode, &wb->b_more_io, i_io_list)
      66           0 :                 nr_more_io++;
                               /* On b_dirty_time only inodes that have I_DIRTY_TIME set are counted. */
      67           0 :         list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
      68           0 :                 if (inode->i_state & I_DIRTY_TIME)
      69           0 :                         nr_dirty_time++;
      70           0 :         spin_unlock(&wb->list_lock);
      71             : 
                               /* Global thresholds first, then this wb's threshold derived from dirty_thresh. */
      72           0 :         global_dirty_limits(&background_thresh, &dirty_thresh);
      73           0 :         wb_thresh = wb_calc_thresh(wb, dirty_thresh);
      74             : 
                               /*
                                * Values passed through K() are converted from pages to kB.
                                * "bdi_list" prints 1 when bdi->bdi_list is non-empty and 0
                                * otherwise; "state" is the raw wb state word in hex.
                                */
      75           0 :         seq_printf(m,
      76             :                    "BdiWriteback:       %10lu kB\n"
      77             :                    "BdiReclaimable:     %10lu kB\n"
      78             :                    "BdiDirtyThresh:     %10lu kB\n"
      79             :                    "DirtyThresh:        %10lu kB\n"
      80             :                    "BackgroundThresh:   %10lu kB\n"
      81             :                    "BdiDirtied:         %10lu kB\n"
      82             :                    "BdiWritten:         %10lu kB\n"
      83             :                    "BdiWriteBandwidth:  %10lu kBps\n"
      84             :                    "b_dirty:            %10lu\n"
      85             :                    "b_io:               %10lu\n"
      86             :                    "b_more_io:          %10lu\n"
      87             :                    "b_dirty_time:       %10lu\n"
      88             :                    "bdi_list:           %10u\n"
      89             :                    "state:              %10lx\n",
      90           0 :                    (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
      91           0 :                    (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
      92             :                    K(wb_thresh),
      93             :                    K(dirty_thresh),
      94             :                    K(background_thresh),
      95           0 :                    (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
      96           0 :                    (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
      97           0 :                    (unsigned long) K(wb->write_bandwidth),
      98             :                    nr_dirty,
      99             :                    nr_io,
     100             :                    nr_more_io,
     101             :                    nr_dirty_time,
     102           0 :                    !list_empty(&bdi->bdi_list), bdi->wb.state);
     103             : 
     104           0 :         return 0;
     105             : }
                       /* Provides bdi_debug_stats_fops, which bdi_debug_register() passes to debugfs_create_file(). */
     106           0 : DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
     107             : 
     108           9 : static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
     109             : {
                               /*
                                * Create a debugfs directory called @name under bdi_debug_root,
                                * store it in bdi->debug_dir, and add a read-only (0444) "stats"
                                * file whose data pointer is @bdi.  Return values of the debugfs
                                * calls are not checked here.
                                */
     110           9 :         bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
     111             : 
     112           9 :         debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
     113             :                             &bdi_debug_stats_fops);
     114           9 : }
     115             : 
     116           0 : static void bdi_debug_unregister(struct backing_dev_info *bdi)
     117             : {
                               /* Remove the directory created by bdi_debug_register() and everything in it. */
     118           0 :         debugfs_remove_recursive(bdi->debug_dir);
     119             : }
     120             : #else
     121             : static inline void bdi_debug_init(void)
     122             : {
                               /* Empty stub used when CONFIG_DEBUG_FS is not set. */
     123             : }
     124             : static inline void bdi_debug_register(struct backing_dev_info *bdi,
     125             :                                       const char *name)
     126             : {
                               /* Empty stub used when CONFIG_DEBUG_FS is not set. */
     127             : }
     128             : static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
     129             : {
                               /* Empty stub used when CONFIG_DEBUG_FS is not set. */
     130             : }
     131             : #endif
     132             : 
     133           0 : static ssize_t read_ahead_kb_store(struct device *dev,
     134             :                                   struct device_attribute *attr,
     135             :                                   const char *buf, size_t count)
     136             : {
                               /*
                                * Store handler for the "read_ahead_kb" attribute: parse @buf as
                                * a base-10 unsigned long (in kB) and store it, converted to
                                * pages, in bdi->ra_pages.  Returns @count on success or the
                                * negative error from kstrtoul().
                                */
     137           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     138           0 :         unsigned long read_ahead_kb;
     139           0 :         ssize_t ret;
     140             : 
     141           0 :         ret = kstrtoul(buf, 10, &read_ahead_kb);
     142           0 :         if (ret < 0)
     143             :                 return ret;
     144             : 
                               /* kB -> pages; the right shift discards any partial page. */
     145           0 :         bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
     146             : 
     147           0 :         return count;
     148             : }
     149             : 
     150             : #define BDI_SHOW(name, expr)                                            \
     151             : static ssize_t name##_show(struct device *dev,                          \
     152             :                            struct device_attribute *attr, char *buf)    \
     153             : {                                                                       \
     154             :         struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
     155             :                                                                         \
     156             :         return sysfs_emit(buf, "%lld\n", (long long)expr);            \
     157             : }                                                                       \
     158             : static DEVICE_ATTR_RW(name);
     159             : 
                       /*
                        * BDI_SHOW(name, expr) defines name##_show(), which emits @expr
                        * (evaluated with a local "bdi" taken from the device's drvdata) as a
                        * signed decimal followed by a newline, and then declares the device
                        * attribute with DEVICE_ATTR_RW(name).  Each use in this file follows
                        * the matching name##_store() definition.
                        *
                        * read_ahead_kb shows bdi->ra_pages converted from pages to kB.
                        */
     160           0 : BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
     161             : 
     162           0 : static ssize_t min_ratio_store(struct device *dev,
     163             :                 struct device_attribute *attr, const char *buf, size_t count)
     164             : {
                               /*
                                * Store handler for the "min_ratio" attribute: parse @buf as a
                                * base-10 unsigned int and pass it to bdi_set_min_ratio().
                                * Returns @count when that call returns 0, otherwise the error
                                * from kstrtouint() or bdi_set_min_ratio().
                                */
     165           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     166           0 :         unsigned int ratio;
     167           0 :         ssize_t ret;
     168             : 
     169           0 :         ret = kstrtouint(buf, 10, &ratio);
     170           0 :         if (ret < 0)
     171             :                 return ret;
     172             : 
     173           0 :         ret = bdi_set_min_ratio(bdi, ratio);
     174           0 :         if (!ret)
     175           0 :                 ret = count;
     176             : 
     177             :         return ret;
     178             : }
                       /* min_ratio_show() emits bdi->min_ratio. */
     179           0 : BDI_SHOW(min_ratio, bdi->min_ratio)
     180             : 
     181           0 : static ssize_t max_ratio_store(struct device *dev,
     182             :                 struct device_attribute *attr, const char *buf, size_t count)
     183             : {
                               /*
                                * Store handler for the "max_ratio" attribute: parse @buf as a
                                * base-10 unsigned int and pass it to bdi_set_max_ratio().
                                * Returns @count when that call returns 0, otherwise the error
                                * from kstrtouint() or bdi_set_max_ratio().
                                */
     184           0 :         struct backing_dev_info *bdi = dev_get_drvdata(dev);
     185           0 :         unsigned int ratio;
     186           0 :         ssize_t ret;
     187             : 
     188           0 :         ret = kstrtouint(buf, 10, &ratio);
     189           0 :         if (ret < 0)
     190             :                 return ret;
     191             : 
     192           0 :         ret = bdi_set_max_ratio(bdi, ratio);
     193           0 :         if (!ret)
     194           0 :                 ret = count;
     195             : 
     196             :         return ret;
     197             : }
                       /* max_ratio_show() emits bdi->max_ratio. */
     198           0 : BDI_SHOW(max_ratio, bdi->max_ratio)
     199             : 
     200           0 : static ssize_t stable_pages_required_show(struct device *dev,
     201             :                                           struct device_attribute *attr,
     202             :                                           char *buf)
     203             : {
                               /*
                                * Show handler for the "stable_pages_required" attribute: warn
                                * once that the attribute has been removed, then always
                                * emit "0\n".
                                */
     204           0 :         dev_warn_once(dev,
     205             :                 "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
     206           0 :         return sysfs_emit(buf, "%d\n", 0);
     207             : }
     208             : static DEVICE_ATTR_RO(stable_pages_required);
     209             : 
     210             : static struct attribute *bdi_dev_attrs[] = {
     211             :         &dev_attr_read_ahead_kb.attr,
     212             :         &dev_attr_min_ratio.attr,
     213             :         &dev_attr_max_ratio.attr,
     214             :         &dev_attr_stable_pages_required.attr,
     215             :         NULL,
     216             : };
                       /*
                        * NULL-terminated list of the device attributes defined above.
                        * ATTRIBUTE_GROUPS() provides bdi_dev_groups, which bdi_class_init()
                        * installs as the class's dev_groups.
                        */
     217             : ATTRIBUTE_GROUPS(bdi_dev);
     218             : 
     219           1 : static __init int bdi_class_init(void)
     220             : {
                               /*
                                * Create the "bdi" device class, attach the attribute groups to
                                * it, and set up the debugfs root.  Returns 0 on success or the
                                * error encoded in the pointer returned by class_create().
                                */
     221           1 :         bdi_class = class_create(THIS_MODULE, "bdi");
     222           1 :         if (IS_ERR(bdi_class))
     223           0 :                 return PTR_ERR(bdi_class);
     224             : 
     225           1 :         bdi_class->dev_groups = bdi_dev_groups;
     226           1 :         bdi_debug_init();
     227             : 
     228           1 :         return 0;
     229             : }
     230             : postcore_initcall(bdi_class_init);
     231             : 
     232             : static int bdi_init(struct backing_dev_info *bdi);
     233             : 
     234           1 : static int __init default_bdi_init(void)
     235             : {
                               /*
                                * Allocate the "writeback" workqueue (bdi_wq) and initialise
                                * noop_backing_dev_info.  Returns -ENOMEM if the workqueue cannot
                                * be allocated, otherwise the result of bdi_init().
                                */
     236           1 :         int err;
     237             : 
     238           1 :         bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
     239             :                                  WQ_SYSFS, 0);
     240           1 :         if (!bdi_wq)
     241             :                 return -ENOMEM;
     242             : 
     243           1 :         err = bdi_init(&noop_backing_dev_info);
     244             : 
     245           1 :         return err;
     246             : }
     247             : subsys_initcall(default_bdi_init);
     248             : 
     249             : /*
     250             :  * This function is used when the first inode for this wb is marked dirty. It
     251             :  * wakes-up the corresponding bdi thread which should then take care of the
     252             :  * periodic background write-out of dirty inodes. Since the write-out would
     253             :  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
     254             :  * set up a timer which wakes the bdi thread up later.
     255             :  *
     256             :  * Note, we wouldn't bother setting up the timer, but this function is on the
     257             :  * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
     258             :  * by delaying the wake-up.
     259             :  *
     260             :  * We have to be careful not to postpone flush work if it is scheduled for
     261             :  * earlier. Thus we use queue_delayed_work().
     262             :  */
     263           8 : void wb_wakeup_delayed(struct bdi_writeback *wb)
     264             : {
     265           8 :         unsigned long timeout;
     266             : 
                               /* dirty_writeback_interval is in centisecs; * 10 gives milliseconds. */
     267           8 :         timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
                               /*
                                * Only queue work while WB_registered is set.  wb_shutdown()
                                * clears that bit under the same work_lock, so no work is
                                * queued here once shutdown has cleared it.
                                */
     268           8 :         spin_lock_bh(&wb->work_lock);
     269           8 :         if (test_bit(WB_registered, &wb->state))
     270           8 :                 queue_delayed_work(bdi_wq, &wb->dwork, timeout);
     271           8 :         spin_unlock_bh(&wb->work_lock);
     272           8 : }
     273             : 
     274             : /*
     275             :  * Initial write bandwidth: 100 MB/s
                        * (expressed in pages: 100 << 20 bytes divided by the page size)
     276             :  */
     277             : #define INIT_BW         (100 << (20 - PAGE_SHIFT))
     278             : 
     279          10 : static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
     280             :                    gfp_t gfp)
     281             : {
                               /*
                                * Initialise @wb for use with @bdi: zero it, set up its lists,
                                * locks, delayed work and timestamps, seed the bandwidth and
                                * ratelimit fields with INIT_BW, and allocate the per-cpu
                                * completions and stat counters using @gfp.
                                *
                                * Returns 0 on success or the error from
                                * fprop_local_init_percpu() / percpu_counter_init(); on failure
                                * everything allocated here is released again.
                                */
     282          10 :         int i, err;
     283             : 
     284          10 :         memset(wb, 0, sizeof(*wb));
     285             : 
                               /*
                                * A wb other than the bdi's embedded one holds a reference on
                                * the bdi; it is dropped on the error path below and in wb_exit().
                                */
     286          10 :         if (wb != &bdi->wb)
     287           0 :                 bdi_get(bdi);
     288          10 :         wb->bdi = bdi;
     289          10 :         wb->last_old_flush = jiffies;
     290          10 :         INIT_LIST_HEAD(&wb->b_dirty);
     291          10 :         INIT_LIST_HEAD(&wb->b_io);
     292          10 :         INIT_LIST_HEAD(&wb->b_more_io);
     293          10 :         INIT_LIST_HEAD(&wb->b_dirty_time);
     294          10 :         spin_lock_init(&wb->list_lock);
     295             : 
     296          10 :         wb->bw_time_stamp = jiffies;
     297          10 :         wb->balanced_dirty_ratelimit = INIT_BW;
     298          10 :         wb->dirty_ratelimit = INIT_BW;
     299          10 :         wb->write_bandwidth = INIT_BW;
     300          10 :         wb->avg_write_bandwidth = INIT_BW;
     301             : 
     302          10 :         spin_lock_init(&wb->work_lock);
     303          10 :         INIT_LIST_HEAD(&wb->work_list);
     304          10 :         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
     305          10 :         wb->dirty_sleep = jiffies;
     306             : 
     307          10 :         err = fprop_local_init_percpu(&wb->completions, gfp);
     308          10 :         if (err)
     309           0 :                 goto out_put_bdi;
     310             : 
     311          50 :         for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
     312          40 :                 err = percpu_counter_init(&wb->stat[i], 0, gfp);
     313          40 :                 if (err)
     314           0 :                         goto out_destroy_stat;
     315             :         }
     316             : 
     317             :         return 0;
     318             : 
     319           0 : out_destroy_stat:
                               /* i is the index that failed: destroy only counters i-1 .. 0. */
     320           0 :         while (i--)
     321           0 :                 percpu_counter_destroy(&wb->stat[i]);
     322           0 :         fprop_local_destroy_percpu(&wb->completions);
     323           0 : out_put_bdi:
     324           0 :         if (wb != &bdi->wb)
     325          10 :                 bdi_put(bdi);
     326             :         return err;
     327             : }
     328             : 
     329             : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
     330             : 
     331             : /*
     332             :  * Shut down @wb: clear WB_registered so that no further work is queued,
                        * unlink it via cgwb_remove_from_bdi_list(), and drain its pending work.
                        * Does nothing if WB_registered was already clear.
     333             :  */
     334           0 : static void wb_shutdown(struct bdi_writeback *wb)
     335             : {
     336             :         /* Make sure nobody queues further work */
     337           0 :         spin_lock_bh(&wb->work_lock);
     338           0 :         if (!test_and_clear_bit(WB_registered, &wb->state)) {
     339           0 :                 spin_unlock_bh(&wb->work_lock);
     340           0 :                 return;
     341             :         }
     342           0 :         spin_unlock_bh(&wb->work_lock);
     343             : 
     344           0 :         cgwb_remove_from_bdi_list(wb);
     345             :         /*
     346             :          * Drain work list and shutdown the delayed_work.  !WB_registered
     347             :          * tells wb_workfn() that @wb is dying and its work_list needs to
     348             :          * be drained no matter what.
     349             :          */
                               /* Kick the work to run without delay, then wait for it to finish. */
     350           0 :         mod_delayed_work(bdi_wq, &wb->dwork, 0);
     351           0 :         flush_delayed_work(&wb->dwork);
     352           0 :         WARN_ON(!list_empty(&wb->work_list));
     353             : }
     354             : 
     355           0 : static void wb_exit(struct bdi_writeback *wb)
     356             : {
                               /*
                                * Release what wb_init() set up: the per-cpu stat counters, the
                                * per-cpu completions, and (for a wb that is not the bdi's
                                * embedded one) the bdi reference.  Warns if the delayed work
                                * is still pending.
                                */
     357           0 :         int i;
     358             : 
     359           0 :         WARN_ON(delayed_work_pending(&wb->dwork));
     360             : 
     361           0 :         for (i = 0; i < NR_WB_STAT_ITEMS; i++)
     362           0 :                 percpu_counter_destroy(&wb->stat[i]);
     363             : 
     364           0 :         fprop_local_destroy_percpu(&wb->completions);
     365           0 :         if (wb != &wb->bdi->wb)
     366           0 :                 bdi_put(wb->bdi);
     367           0 : }
     368             : 
     369             : #ifdef CONFIG_CGROUP_WRITEBACK
     370             : 
     371             : #include <linux/memcontrol.h>
     372             : 
     373             : /*
     374             :  * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
     375             :  * bdi->cgwb_tree is also RCU protected.
     376             :  */
     377             : static DEFINE_SPINLOCK(cgwb_lock);
                       /* Workqueue on which cgwb_release() queues each wb's release_work. */
     378             : static struct workqueue_struct *cgwb_release_wq;
     379             : 
     380             : static void cgwb_release_workfn(struct work_struct *work)
     381             : {
                               /*
                                * Work function for wb->release_work (queued by cgwb_release()):
                                * shuts the wb down, drops the references and pin taken in
                                * cgwb_create(), tears down its per-cpu state and frees it.
                                */
     382             :         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
     383             :                                                 release_work);
                               /* Looked up before the css_put() below; used afterwards for the unpin. */
     384             :         struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
     385             : 
                               /* Shutdown and the css puts happen under the bdi's cgwb_release_mutex. */
     386             :         mutex_lock(&wb->bdi->cgwb_release_mutex);
     387             :         wb_shutdown(wb);
     388             : 
                               /* Pair with the css_get() calls made when the wb was linked in cgwb_create(). */
     389             :         css_put(wb->memcg_css);
     390             :         css_put(wb->blkcg_css);
     391             :         mutex_unlock(&wb->bdi->cgwb_release_mutex);
     392             : 
     393             :         /* triggers blkg destruction if no online users left */
     394             :         blkcg_unpin_online(blkcg);
     395             : 
     396             :         fprop_local_destroy_percpu(&wb->memcg_completions);
     397             :         percpu_ref_exit(&wb->refcnt);
     398             :         wb_exit(wb);
                               /*
                                * Freed via kfree_rcu() rather than kfree() -- presumably because
                                * RCU readers such as wb_get_lookup() may still see the pointer.
                                */
     399             :         kfree_rcu(wb, rcu);
     400             : }
     401             : 
     402             : static void cgwb_release(struct percpu_ref *refcnt)
     403             : {
                               /*
                                * Release callback registered for wb->refcnt in cgwb_create().
                                * Does no teardown itself; it queues release_work so that
                                * cgwb_release_workfn() runs on cgwb_release_wq.
                                */
     404             :         struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
     405             :                                                 refcnt);
     406             :         queue_work(cgwb_release_wq, &wb->release_work);
     407             : }
     408             : 
     409             : static void cgwb_kill(struct bdi_writeback *wb)
     410             : {
                               /*
                                * Unlink @wb and start dropping it.  The caller must hold
                                * cgwb_lock.  Removes @wb from its bdi's cgwb_tree (warning if
                                * no entry was found under its memcg id) and from the memcg and
                                * blkcg lists, then kills its percpu refcount.
                                */
     411             :         lockdep_assert_held(&cgwb_lock);
     412             : 
     413             :         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
     414             :         list_del(&wb->memcg_node);
     415             :         list_del(&wb->blkcg_node);
     416             :         percpu_ref_kill(&wb->refcnt);
     417             : }
     418             : 
     419             : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
     420             : {
                               /*
                                * Unlink @wb->bdi_node (added to bdi->wb_list in cgwb_create())
                                * with the RCU list primitive, under cgwb_lock with interrupts
                                * disabled.
                                */
     421             :         spin_lock_irq(&cgwb_lock);
     422             :         list_del_rcu(&wb->bdi_node);
     423             :         spin_unlock_irq(&cgwb_lock);
     424             : }
     425             : 
     426             : static int cgwb_create(struct backing_dev_info *bdi,
     427             :                        struct cgroup_subsys_state *memcg_css, gfp_t gfp)
     428             : {
                               /*
                                * Make sure a wb keyed by @memcg_css->id exists in
                                * @bdi->cgwb_tree, allocating and linking a new one (with @gfp)
                                * if needed.
                                *
                                * Returns 0 when a wb with a matching blkcg already exists, when
                                * a new one was inserted, or when a racing creator inserted one
                                * first (-EEXIST from the insert).  Returns -ENOMEM or another
                                * init error on allocation failure, and -ENODEV when the bdi is
                                * not registered or one of the cgroup lists is no longer online.
                                * No wb pointer is returned; wb_get_create() looks it up again.
                                */
     429             :         struct mem_cgroup *memcg;
     430             :         struct cgroup_subsys_state *blkcg_css;
     431             :         struct blkcg *blkcg;
     432             :         struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
     433             :         struct bdi_writeback *wb;
     434             :         unsigned long flags;
     435             :         int ret = 0;
     436             : 
     437             :         memcg = mem_cgroup_from_css(memcg_css);
                               /* The css obtained here is released with css_put() at out_put on every path. */
     438             :         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
     439             :         blkcg = css_to_blkcg(blkcg_css);
     440             :         memcg_cgwb_list = &memcg->cgwb_list;
     441             :         blkcg_cgwb_list = &blkcg->cgwb_list;
     442             : 
     443             :         /* look up again under lock and discard on blkcg mismatch */
     444             :         spin_lock_irqsave(&cgwb_lock, flags);
     445             :         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
     446             :         if (wb && wb->blkcg_css != blkcg_css) {
     447             :                 cgwb_kill(wb);
     448             :                 wb = NULL;
     449             :         }
     450             :         spin_unlock_irqrestore(&cgwb_lock, flags);
                               /* A wb with the right blkcg already exists: nothing to do (ret is still 0). */
     451             :         if (wb)
     452             :                 goto out_put;
     453             : 
     454             :         /* need to create a new one */
     455             :         wb = kmalloc(sizeof(*wb), gfp);
     456             :         if (!wb) {
     457             :                 ret = -ENOMEM;
     458             :                 goto out_put;
     459             :         }
     460             : 
     461             :         ret = wb_init(wb, bdi, gfp);
     462             :         if (ret)
     463             :                 goto err_free;
     464             : 
                               /* cgwb_release() is the callback registered for this refcount. */
     465             :         ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
     466             :         if (ret)
     467             :                 goto err_wb_exit;
     468             : 
     469             :         ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
     470             :         if (ret)
     471             :                 goto err_ref_exit;
     472             : 
     473             :         wb->memcg_css = memcg_css;
     474             :         wb->blkcg_css = blkcg_css;
     475             :         INIT_WORK(&wb->release_work, cgwb_release_workfn);
     476             :         set_bit(WB_registered, &wb->state);
     477             : 
     478             :         /*
     479             :          * The root wb determines the registered state of the whole bdi and
     480             :          * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
     481             :          * whether they're still online.  Don't link @wb if any is dead.
     482             :          * See wb_memcg_offline() and wb_blkcg_offline().
     483             :          */
                               /* -ENODEV is what is returned if the conditions below do not hold. */
     484             :         ret = -ENODEV;
     485             :         spin_lock_irqsave(&cgwb_lock, flags);
     486             :         if (test_bit(WB_registered, &bdi->wb.state) &&
     487             :             blkcg_cgwb_list->next && memcg_cgwb_list->next) {
     488             :                 /* we might have raced another instance of this function */
     489             :                 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
     490             :                 if (!ret) {
                                               /*
                                                * Linked: the references and pin taken here are
                                                * dropped again in cgwb_release_workfn().
                                                */
     491             :                         list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
     492             :                         list_add(&wb->memcg_node, memcg_cgwb_list);
     493             :                         list_add(&wb->blkcg_node, blkcg_cgwb_list);
     494             :                         blkcg_pin_online(blkcg);
     495             :                         css_get(memcg_css);
     496             :                         css_get(blkcg_css);
     497             :                 }
     498             :         }
     499             :         spin_unlock_irqrestore(&cgwb_lock, flags);
     500             :         if (ret) {
                                       /*
                                        * Losing the insertion race is not an error for the
                                        * caller, but our unused wb still has to be torn down.
                                        */
     501             :                 if (ret == -EEXIST)
     502             :                         ret = 0;
     503             :                 goto err_fprop_exit;
     504             :         }
     505             :         goto out_put;
     506             : 
                       /* Error unwinding: each label undoes one initialisation step, in reverse order. */
     507             : err_fprop_exit:
     508             :         fprop_local_destroy_percpu(&wb->memcg_completions);
     509             : err_ref_exit:
     510             :         percpu_ref_exit(&wb->refcnt);
     511             : err_wb_exit:
     512             :         wb_exit(wb);
     513             : err_free:
     514             :         kfree(wb);
     515             : out_put:
     516             :         css_put(blkcg_css);
     517             :         return ret;
     518             : }
     519             : 
     520             : /**
     521             :  * wb_get_lookup - get wb for a given memcg
     522             :  * @bdi: target bdi
     523             :  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
     524             :  *
     525             :  * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
     526             :  * refcount incremented.
     527             :  *
     528             :  * This function uses css_get() on @memcg_css and thus expects its refcnt
     529             :  * to be positive on invocation.  IOW, rcu_read_lock() protection on
     530             :  * @memcg_css isn't enough.  try_get it before calling this function.
     531             :  *
     532             :  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
     533             :  * memcg on the default hierarchy, memcg association is guaranteed to be
     534             :  * more specific (equal or descendant to the associated blkcg) and thus can
     535             :  * identify both the memcg and blkcg associations.
     536             :  *
     537             :  * Because the blkcg associated with a memcg may change as blkcg is enabled
     538             :  * and disabled closer to root in the hierarchy, each wb keeps track of
     539             :  * both the memcg and blkcg associated with it and verifies the blkcg on
     540             :  * each lookup.  On mismatch, the existing wb is discarded and a new one is
     541             :  * created.
                        *
                        * Return: &@bdi->wb when @memcg_css has no parent (no wb_tryget() is done
                        * in this function for that case); otherwise the wb found in
                        * @bdi->cgwb_tree with a reference taken by wb_tryget(), or %NULL when
                        * no wb is found, its blkcg does not match, or wb_tryget() fails.
     542             :  */
     543             : struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
     544             :                                     struct cgroup_subsys_state *memcg_css)
     545             : {
     546             :         struct bdi_writeback *wb;
     547             : 
     548             :         if (!memcg_css->parent)
     549             :                 return &bdi->wb;
     550             : 
                               /* The tree lookup and the tryget are done inside an RCU read-side section. */
     551             :         rcu_read_lock();
     552             :         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
     553             :         if (wb) {
     554             :                 struct cgroup_subsys_state *blkcg_css;
     555             : 
     556             :                 /* see whether the blkcg association has changed */
     557             :                 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
     558             :                 if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
     559             :                         wb = NULL;
                                       /* blkcg_css was needed only for the comparison above. */
     560             :                 css_put(blkcg_css);
     561             :         }
     562             :         rcu_read_unlock();
     563             : 
     564             :         return wb;
     565             : }
     566             : 
     567             : /**
     568             :  * wb_get_create - get wb for a given memcg, create if necessary
     569             :  * @bdi: target bdi
     570             :  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
     571             :  * @gfp: allocation mask to use
     572             :  *
     573             :  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
     574             :  * create one.  See wb_get_lookup() for more details.
                        *
                        * Return: &@bdi->wb when @memcg_css has no parent; otherwise the wb
                        * returned by wb_get_lookup(), or %NULL if cgwb_create() fails.
     575             :  */
     576             : struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
     577             :                                     struct cgroup_subsys_state *memcg_css,
     578             :                                     gfp_t gfp)
     579             : {
     580             :         struct bdi_writeback *wb;
     581             : 
     582             :         might_alloc(gfp);
     583             : 
     584             :         if (!memcg_css->parent)
     585             :                 return &bdi->wb;
     586             : 
                               /*
                                * Retry the lookup after every successful cgwb_create() (which
                                * returns 0 but no pointer).  The loop ends when a lookup
                                * succeeds or cgwb_create() reports an error.
                                */
     587             :         do {
     588             :                 wb = wb_get_lookup(bdi, memcg_css);
     589             :         } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
     590             : 
     591             :         return wb;
     592             : }
     593             : 
     594             : static int cgwb_bdi_init(struct backing_dev_info *bdi)  /* variant compiled before the "#else CONFIG_CGROUP_WRITEBACK" branch */
     595             : {
     596             :         int ret;
     597             : 
     598             :         INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);        /* empty cgwb tree, initialised with the GFP_ATOMIC mask */
     599             :         mutex_init(&bdi->cgwb_release_mutex);
     600             :         init_rwsem(&bdi->wb_switch_rwsem);
     601             : 
     602             :         ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);    /* set up the wb embedded in the bdi */
     603             :         if (!ret) {  /* only on success: attach the embedded wb to the root memcg css and blkcg_root_css */
     604             :                 bdi->wb.memcg_css = &root_mem_cgroup->css;
     605             :                 bdi->wb.blkcg_css = blkcg_root_css;
     606             :         }
     607             :         return ret;  /* wb_init()'s return value, passed through unchanged */
     608             : }
     609             : 
     610             : static void cgwb_bdi_unregister(struct backing_dev_info *bdi)   /* kill every wb in cgwb_tree, then shut down every wb on wb_list */
     611             : {
     612             :         struct radix_tree_iter iter;
     613             :         void **slot;
     614             :         struct bdi_writeback *wb;
     615             : 
     616             :         WARN_ON(test_bit(WB_registered, &bdi->wb.state));    /* warns if the embedded wb still has WB_registered set */
     617             : 
     618             :         spin_lock_irq(&cgwb_lock);   /* pass 1: cgwb_kill() every entry in the tree, under cgwb_lock */
     619             :         radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
     620             :                 cgwb_kill(*slot);
     621             :         spin_unlock_irq(&cgwb_lock);
     622             : 
     623             :         mutex_lock(&bdi->cgwb_release_mutex);        /* pass 2 runs with cgwb_release_mutex held throughout */
     624             :         spin_lock_irq(&cgwb_lock);
     625             :         while (!list_empty(&bdi->wb_list)) { /* NOTE(review): terminates only if wb_shutdown() ends up unlinking wb from wb_list - confirm */
     626             :                 wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
     627             :                                       bdi_node);
     628             :                 spin_unlock_irq(&cgwb_lock); /* cgwb_lock is dropped around wb_shutdown() */
     629             :                 wb_shutdown(wb);
     630             :                 spin_lock_irq(&cgwb_lock);   /* retaken before wb_list is re-examined */
     631             :         }
     632             :         spin_unlock_irq(&cgwb_lock);
     633             :         mutex_unlock(&bdi->cgwb_release_mutex);
     634             : }
     635             : 
     636             : /**
     637             :  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
     638             :  * @memcg: memcg being offlined
     639             :  *
     640             :  * Also prevents creation of any new wb's associated with @memcg.  Runs entirely under cgwb_lock with interrupts disabled.
     641             :  */
     642             : void wb_memcg_offline(struct mem_cgroup *memcg)
     643             : {
     644             :         struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
     645             :         struct bdi_writeback *wb, *next;
     646             : 
     647             :         spin_lock_irq(&cgwb_lock);
     648             :         list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)      /* _safe walk: the following entry is cached before cgwb_kill(wb) runs */
     649             :                 cgwb_kill(wb);
     650             :         memcg_cgwb_list->next = NULL;        /* prevent new wb's; NOTE(review): presumably the creation path tests ->next for NULL - not visible here */
     651             :         spin_unlock_irq(&cgwb_lock);
     652             : }
     653             : 
     654             : /**
     655             :  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
     656             :  * @blkcg: blkcg being offlined
     657             :  *
     658             :  * Also prevents creation of any new wb's associated with @blkcg.  Runs entirely under cgwb_lock with interrupts disabled.
     659             :  */
     660             : void wb_blkcg_offline(struct blkcg *blkcg)
     661             : {
     662             :         struct bdi_writeback *wb, *next;
     663             : 
     664             :         spin_lock_irq(&cgwb_lock);
     665             :         list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)    /* _safe walk: the following entry is cached before cgwb_kill(wb) runs */
     666             :                 cgwb_kill(wb);
     667             :         blkcg->cgwb_list.next = NULL;        /* prevent new wb's; NOTE(review): presumably the creation path tests ->next for NULL - not visible here */
     668             :         spin_unlock_irq(&cgwb_lock);
     669             : }
     670             : 
     671             : static void cgwb_bdi_register(struct backing_dev_info *bdi)     /* put the bdi's embedded wb on the bdi's wb_list */
     672             : {
     673             :         spin_lock_irq(&cgwb_lock);   /* in this variant the list update is done under cgwb_lock */
     674             :         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
     675             :         spin_unlock_irq(&cgwb_lock);
     676             : }
     677             : 
     678             : static int __init cgwb_init(void)       /* initcall: allocate the workqueue stored in cgwb_release_wq */
     679             : {
     680             :         /*
     681             :          * There can be many concurrent release work items overwhelming
     682             :          * system_wq.  Put them in a separate wq and limit concurrency.
     683             :          * There's no point in executing many of these in parallel.
     684             :          */
     685             :         cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);     /* flags = 0, max_active = 1 */
     686             :         if (!cgwb_release_wq)
     687             :                 return -ENOMEM;      /* allocation failure is reported to the initcall machinery */
     688             : 
     689             :         return 0;
     690             : }
     691             : subsys_initcall(cgwb_init);     /* registered at subsys initcall level */
     692             : 
     693             : #else   /* CONFIG_CGROUP_WRITEBACK */
     694             : 
     695          10 : static int cgwb_bdi_init(struct backing_dev_info *bdi)  /* "#else CONFIG_CGROUP_WRITEBACK" variant: only the embedded wb is set up */
     696             : {
     697          10 :         return wb_init(&bdi->wb, bdi, GFP_KERNEL);   /* wb_init()'s result is returned as-is */
     698             : }
     699             : 
     700           0 : static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }       /* intentionally empty in the "#else CONFIG_CGROUP_WRITEBACK" branch */
     701             : 
     702           9 : static void cgwb_bdi_register(struct backing_dev_info *bdi)     /* put the bdi's embedded wb on the bdi's wb_list */
     703             : {
     704          18 :         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); /* no lock is taken in this variant */
     705             : }
     706             : 
     707           0 : static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) /* unlink @wb via its bdi_node, the node cgwb_bdi_register() puts on wb_list */
     708             : {
     709           0 :         list_del_rcu(&wb->bdi_node); /* RCU list removal; no lock is taken in this variant */
     710             : }
     711             : 
     712             : #endif  /* CONFIG_CGROUP_WRITEBACK */
     713             : 
     714          10 : static int bdi_init(struct backing_dev_info *bdi)       /* fill in a bdi's basic fields; returns cgwb_bdi_init()'s result */
     715             : {
     716          10 :         int ret;
     717             : 
     718          10 :         bdi->dev = NULL;     /* no device yet; bdi_register_va() is what sets ->dev */
     719             : 
     720          10 :         kref_init(&bdi->refcnt);     /* refcount whose release callback is release_bdi() (see bdi_put()) */
     721          10 :         bdi->min_ratio = 0;
     722          10 :         bdi->max_ratio = 100;
     723          10 :         bdi->max_prop_frac = FPROP_FRAC_BASE;
     724          10 :         INIT_LIST_HEAD(&bdi->bdi_list);      /* node later linked onto the global bdi_list */
     725          10 :         INIT_LIST_HEAD(&bdi->wb_list);       /* head of this bdi's list of wb's */
     726          10 :         init_waitqueue_head(&bdi->wb_waitq);
     727             : 
     728          10 :         ret = cgwb_bdi_init(bdi);    /* config-dependent part; includes wb_init() of the embedded wb */
     729             : 
     730          10 :         return ret;
     731             : }
     732             : 
     733           9 : struct backing_dev_info *bdi_alloc(int node_id) /* allocate and initialise a bdi; returns NULL on any failure */
     734             : {
     735           9 :         struct backing_dev_info *bdi;
     736             : 
     737           9 :         bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);       /* zeroed allocation, placed according to @node_id */
     738           9 :         if (!bdi)
     739             :                 return NULL;
     740             : 
     741           9 :         if (bdi_init(bdi)) { /* non-zero means init failed: free the memory and report NULL */
     742           0 :                 kfree(bdi);
     743           0 :                 return NULL;
     744             :         }
     745           9 :         bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;      /* initial capability flags */
     746           9 :         bdi->ra_pages = VM_READAHEAD_PAGES;  /* ra_pages and io_pages both start at VM_READAHEAD_PAGES */
     747           9 :         bdi->io_pages = VM_READAHEAD_PAGES;
     748           9 :         return bdi;
     749             : }
     750             : EXPORT_SYMBOL(bdi_alloc);
     751             : 
     752           9 : static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)    /* find the bdi_tree link slot for @id; caller holds bdi_lock */
     753             : {
     754           9 :         struct rb_node **p = &bdi_tree.rb_node;      /* start at the root link */
     755           9 :         struct rb_node *parent = NULL;
     756           9 :         struct backing_dev_info *bdi;
     757             : 
     758          27 :         lockdep_assert_held(&bdi_lock);
     759             : 
     760          32 :         while (*p) { /* descend until an empty link or a node whose id equals @id */
     761          23 :                 parent = *p;
     762          23 :                 bdi = rb_entry(parent, struct backing_dev_info, rb_node);
     763             : 
     764          23 :                 if (bdi->id > id)
     765           0 :                         p = &(*p)->rb_left;  /* @id is smaller than this node's id: go left */
     766          23 :                 else if (bdi->id < id)
     767          23 :                         p = &(*p)->rb_right; /* @id is larger than this node's id: go right */
     768             :                 else
     769             :                         break;       /* exact match: *p is the matching node (and so is parent) */
     770             :         }
     771             : 
     772           9 :         if (parentp) /* optional out-parameter: the last node visited, NULL if the tree was empty */
     773           9 :                 *parentp = parent;
     774           9 :         return p;    /* *p is non-NULL on a match; otherwise p is the empty link where @id would be inserted */
     775             : }
     776             : 
     777             : /**
     778             :  * bdi_get_by_id - lookup and get bdi from its id
     779             :  * @id: bdi id to lookup
     780             :  *
     781             :  * Find bdi matching @id and get it.  Returns NULL if the matching bdi
     782             :  * doesn't exist or is already unregistered.  The lookup and bdi_get() both happen under bdi_lock.
     783             :  */
     784           0 : struct backing_dev_info *bdi_get_by_id(u64 id)
     785             : {
     786           0 :         struct backing_dev_info *bdi = NULL;
     787           0 :         struct rb_node **p;
     788             : 
     789           0 :         spin_lock_bh(&bdi_lock);
     790           0 :         p = bdi_lookup_rb_node(id, NULL);    /* the parent node is not needed for a pure lookup */
     791           0 :         if (*p) {    /* a non-empty link means a node with this id was found */
     792           0 :                 bdi = rb_entry(*p, struct backing_dev_info, rb_node);
     793           0 :                 bdi_get(bdi);        /* reference taken before bdi_lock is released */
     794             :         }
     795           0 :         spin_unlock_bh(&bdi_lock);
     796             : 
     797           0 :         return bdi;
     798             : }
     799             : 
     800           9 : int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
     801             : {
     802           9 :         struct device *dev;
     803           9 :         struct rb_node *parent, **p;
     804             : 
     805           9 :         if (bdi->dev)        /* The driver needs to use separate queues per device */
     806             :                 return 0;
     807             : 
     808           9 :         vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
     809           9 :         dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
     810           9 :         if (IS_ERR(dev))
     811           0 :                 return PTR_ERR(dev);
     812             : 
     813           9 :         cgwb_bdi_register(bdi);
     814           9 :         bdi->dev = dev;
     815             : 
     816          18 :         bdi_debug_register(bdi, dev_name(dev));
     817           9 :         set_bit(WB_registered, &bdi->wb.state);
     818             : 
     819           9 :         spin_lock_bh(&bdi_lock);
     820             : 
     821           9 :         bdi->id = ++bdi_id_cursor;
     822             : 
     823           9 :         p = bdi_lookup_rb_node(bdi->id, &parent);
     824           9 :         rb_link_node(&bdi->rb_node, parent, p);
     825           9 :         rb_insert_color(&bdi->rb_node, &bdi_tree);
     826             : 
     827           9 :         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
     828             : 
     829           9 :         spin_unlock_bh(&bdi_lock);
     830             : 
     831           9 :         trace_writeback_bdi_register(bdi);
     832           9 :         return 0;
     833             : }
     834             : 
     835           9 : int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)    /* variadic front end for bdi_register_va(); returns its result */
     836             : {
     837           9 :         va_list args;
     838           9 :         int ret;
     839             : 
     840           9 :         va_start(args, fmt); /* package the variadic arguments for the va_list variant */
     841           9 :         ret = bdi_register_va(bdi, fmt, args);
     842           9 :         va_end(args);
     843           9 :         return ret;
     844             : }
     845             : EXPORT_SYMBOL(bdi_register);
     846             : 
     847           9 : void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)  /* record @owner in the bdi and take a device reference on it */
     848             : {
     849           9 :         WARN_ON_ONCE(bdi->owner);    /* warns (once) if an owner is already set; the old value is overwritten regardless */
     850           9 :         bdi->owner = owner;
     851           9 :         get_device(owner);   /* the matching put_device() is in bdi_unregister() */
     852           9 : }
     853             : 
     854             : /*
     855             :  * Remove bdi from bdi_list, and ensure that it is no longer visible.  The bdi is also erased from bdi_tree under the same bdi_lock hold.
     856             :  */
     857           0 : static void bdi_remove_from_list(struct backing_dev_info *bdi)
     858             : {
     859           0 :         spin_lock_bh(&bdi_lock);
     860           0 :         rb_erase(&bdi->rb_node, &bdi_tree);  /* no longer reachable through bdi_lookup_rb_node() */
     861           0 :         list_del_rcu(&bdi->bdi_list);        /* RCU removal from the global bdi_list */
     862           0 :         spin_unlock_bh(&bdi_lock);
     863             : 
     864           0 :         synchronize_rcu_expedited(); /* called after the lock is dropped: wait for an (expedited) RCU grace period before returning */
     865           0 : }
     866             : 
     867           0 : void bdi_unregister(struct backing_dev_info *bdi)       /* unpublish the bdi, shut down its wb's, then drop its device and owner */
     868             : {
     869             :         /* make sure nobody finds us on the bdi_list anymore */
     870           0 :         bdi_remove_from_list(bdi);
     871           0 :         wb_shutdown(&bdi->wb);       /* the embedded wb is shut down first */
     872           0 :         cgwb_bdi_unregister(bdi);    /* config-dependent; an empty stub in the "#else CONFIG_CGROUP_WRITEBACK" branch */
     873             : 
     874           0 :         if (bdi->dev) {      /* only when a device exists */
     875           0 :                 bdi_debug_unregister(bdi);
     876           0 :                 device_unregister(bdi->dev);
     877           0 :                 bdi->dev = NULL;     /* cleared so release_bdi()'s WARN_ON_ONCE(bdi->dev) stays quiet */
     878             :         }
     879             : 
     880           0 :         if (bdi->owner) {
     881           0 :                 put_device(bdi->owner);      /* drops the reference taken by get_device() in bdi_set_owner() */
     882           0 :                 bdi->owner = NULL;
     883             :         }
     884           0 : }
     885             : 
     886           0 : static void release_bdi(struct kref *ref)       /* kref release callback handed to kref_put() in bdi_put(); tears down and frees the bdi */
     887             : {
     888           0 :         struct backing_dev_info *bdi =
     889           0 :                         container_of(ref, struct backing_dev_info, refcnt);  /* recover the bdi from its embedded refcnt */
     890             : 
     891           0 :         if (test_bit(WB_registered, &bdi->wb.state)) /* bit is set by bdi_register_va(): unregister first if still set */
     892           0 :                 bdi_unregister(bdi);
     893           0 :         WARN_ON_ONCE(bdi->dev);      /* ->dev is expected to be NULL by now */
     894           0 :         wb_exit(&bdi->wb);
     895           0 :         kfree(bdi);
     896           0 : }
     897             : 
     898           1 : void bdi_put(struct backing_dev_info *bdi)      /* drop one reference on @bdi */
     899             : {
     900           1 :         kref_put(&bdi->refcnt, release_bdi); /* release_bdi() is the release callback passed for this refcount */
     901           0 : }
     902             : EXPORT_SYMBOL(bdi_put);
     903             : 
     904           7 : const char *bdi_dev_name(struct backing_dev_info *bdi)  /* name string for @bdi; accepts a NULL @bdi */
     905             : {
     906           7 :         if (!bdi || !bdi->dev)       /* NULL bdi, or one with no device set */
     907           0 :                 return bdi_unknown_name;     /* the static "(unknown)" string */
     908           7 :         return bdi->dev_name;        /* buffer filled in by bdi_register_va() */
     909             : }
     910             : EXPORT_SYMBOL_GPL(bdi_dev_name);
     911             : 
     912             : static wait_queue_head_t congestion_wqh[2] = {  /* two wait queues, indexed directly by the `sync` argument of the functions below */
     913             :                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
     914             :                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
     915             :         };
     916             : static atomic_t nr_wb_congested[2];     /* per-`sync` counters: bumped by set_bdi_congested(), dropped by clear_bdi_congested() */
     917             : 
     918           0 : void clear_bdi_congested(struct backing_dev_info *bdi, int sync)        /* clear the congested bit for @sync on bdi->wb and wake congestion waiters */
     919             : {
     920           0 :         wait_queue_head_t *wqh = &congestion_wqh[sync];      /* @sync indexes a 2-element array directly, so it must be 0 or 1 */
     921           0 :         enum wb_congested_state bit;
     922             : 
     923           0 :         bit = sync ? WB_sync_congested : WB_async_congested;
     924           0 :         if (test_and_clear_bit(bit, &bdi->wb.congested))     /* decrement only on a real set->clear transition */
     925           0 :                 atomic_dec(&nr_wb_congested[sync]);
     926           0 :         smp_mb__after_atomic();      /* barrier between the atomic updates above and the waitqueue_active() check below */
     927           0 :         if (waitqueue_active(wqh))   /* wake_up() is only called when the queue has waiters; runs even if the bit was already clear */
     928           0 :                 wake_up(wqh);
     929           0 : }
     930             : EXPORT_SYMBOL(clear_bdi_congested);
     931             : 
     932           0 : void set_bdi_congested(struct backing_dev_info *bdi, int sync)  /* set the congested bit for @sync on bdi->wb */
     933             : {
     934           0 :         enum wb_congested_state bit;
     935             : 
     936           0 :         bit = sync ? WB_sync_congested : WB_async_congested;
     937           0 :         if (!test_and_set_bit(bit, &bdi->wb.congested))      /* increment only on a real clear->set transition */
     938           0 :                 atomic_inc(&nr_wb_congested[sync]);  /* @sync indexes a 2-element array directly, so it must be 0 or 1 */
     939           0 : }
     940             : EXPORT_SYMBOL(set_bdi_congested);
     941             : 
     942             : /**
     943             :  * congestion_wait - wait for a backing_dev to become uncongested
     944             :  * @sync: SYNC or ASYNC IO
     945             :  * @timeout: timeout in jiffies
     946             :  *
     947             :  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
     948             :  * write congestion.  If no backing_devs are congested then just wait for the
     949             :  * next write to be completed.  Returns the value returned by io_schedule_timeout().
     950             :  */
     951           0 : long congestion_wait(int sync, long timeout)
     952             : {
     953           0 :         long ret;
     954           0 :         unsigned long start = jiffies;       /* sampled only for the trace event at the end */
     955           0 :         DEFINE_WAIT(wait);
     956           0 :         wait_queue_head_t *wqh = &congestion_wqh[sync];      /* @sync indexes a 2-element array directly, so it must be 0 or 1 */
     957             : 
     958           0 :         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);   /* queued unconditionally: no congestion state is checked in this function */
     959           0 :         ret = io_schedule_timeout(timeout);
     960           0 :         finish_wait(wqh, &wait);
     961             : 
     962           0 :         trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
     963             :                                         jiffies_to_usecs(jiffies - start));  /* requested timeout and elapsed time, both converted to usecs */
     964             : 
     965           0 :         return ret;
     966             : }
     967             : EXPORT_SYMBOL(congestion_wait);
     968             : 
     969             : /**
     970             :  * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
     971             :  * @sync: SYNC or ASYNC IO
     972             :  * @timeout: timeout in jiffies
     973             :  *
     974             :  * In the event of a congested backing_dev (any backing_dev) this waits
     975             :  * for up to @timeout jiffies for either a BDI to exit congestion of the
     976             :  * given @sync queue or a write to complete.  Congestion is judged from the nr_wb_congested[@sync] counter.
     977             :  *
     978             :  * The return value is 0 if the sleep is for the full timeout. Otherwise,
     979             :  * it is the number of jiffies that were still remaining when the function
     980             :  * returned. return_value == timeout implies the function did not sleep.
     981             :  */
     982           0 : long wait_iff_congested(int sync, long timeout)
     983             : {
     984           0 :         long ret;
     985           0 :         unsigned long start = jiffies;       /* baseline for both the remaining-time calculation and the trace event */
     986           0 :         DEFINE_WAIT(wait);
     987           0 :         wait_queue_head_t *wqh = &congestion_wqh[sync];      /* @sync indexes a 2-element array directly, so it must be 0 or 1 */
     988             : 
     989             :         /*
     990             :          * If there is no congestion, yield if necessary instead
     991             :          * of sleeping on the congestion queue
     992             :          */
     993           0 :         if (atomic_read(&nr_wb_congested[sync]) == 0) {
     994           0 :                 cond_resched();
     995             : 
     996             :                 /* In case we scheduled, work out time remaining */
     997           0 :                 ret = timeout - (jiffies - start);
     998           0 :                 if (ret < 0) /* more than @timeout jiffies went by: clamp the remainder to 0 */
     999             :                         ret = 0;
    1000             : 
    1001           0 :                 goto out;    /* skip the sleep but still emit the trace event */
    1002             :         }
    1003             : 
    1004             :         /* Sleep until uncongested or a write happens */
    1005           0 :         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
    1006           0 :         ret = io_schedule_timeout(timeout);  /* on this path ret is whatever io_schedule_timeout() returns */
    1007           0 :         finish_wait(wqh, &wait);
    1008             : 
    1009           0 : out:
    1010           0 :         trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
    1011             :                                         jiffies_to_usecs(jiffies - start));  /* requested timeout and elapsed time, both converted to usecs */
    1012             : 
    1013           0 :         return ret;
    1014             : }
    1015             : EXPORT_SYMBOL(wait_iff_congested);

Generated by: LCOV version 1.14