LCOV - code coverage report
Current view: top level - fs/ext4 - mballoc.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 1691 2801 60.4 %
Date: 2021-04-22 12:43:58 Functions: 74 99 74.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
       4             :  * Written by Alex Tomas <alex@clusterfs.com>
       5             :  */
       6             : 
       7             : 
       8             : /*
       9             :  * mballoc.c contains the multiblocks allocation routines
      10             :  */
      11             : 
      12             : #include "ext4_jbd2.h"
      13             : #include "mballoc.h"
      14             : #include <linux/log2.h>
      15             : #include <linux/module.h>
      16             : #include <linux/slab.h>
      17             : #include <linux/nospec.h>
      18             : #include <linux/backing-dev.h>
      19             : #include <trace/events/ext4.h>
      20             : 
      21             : /*
      22             :  * MUSTDO:
      23             :  *   - test ext4_ext_search_left() and ext4_ext_search_right()
      24             :  *   - search for metadata in few groups
      25             :  *
      26             :  * TODO v4:
      27             :  *   - normalization should take into account whether file is still open
      28             :  *   - discard preallocations if no free space left (policy?)
      29             :  *   - don't normalize tails
      30             :  *   - quota
      31             :  *   - reservation for superuser
      32             :  *
      33             :  * TODO v3:
      34             :  *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
      35             :  *   - track min/max extents in each group for better group selection
      36             :  *   - mb_mark_used() may allocate chunk right after splitting buddy
      37             :  *   - tree of groups sorted by number of free blocks
      38             :  *   - error handling
      39             :  */
      40             : 
      41             : /*
      42             :  * An allocation request asks for multiple blocks
      43             :  * near the specified goal (block) value.
      44             :  *
      45             :  * During initialization phase of the allocator we decide to use the
      46             :  * group preallocation or inode preallocation depending on the size of
      47             :  * the file. The size of the file could be the resulting file size we
      48             :  * would have after allocation, or the current file size, whichever
      49             :  * is larger. If the size is less than sbi->s_mb_stream_request we
      50             :  * select to use the group preallocation. The default value of
      51             :  * s_mb_stream_request is 16 blocks. This can also be tuned via
      52             :  * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
      53             :  * terms of number of blocks.
      54             :  *
      55             :  * The main motivation for having small file use group preallocation is to
      56             :  * ensure that we have small files closer together on the disk.
      57             :  *
      58             :  * In the first stage, the allocator looks at the inode prealloc list,
      59             :  * ext4_inode_info->i_prealloc_list, which contains list of prealloc
      60             :  * spaces for this particular inode. The inode prealloc space is
      61             :  * represented as:
      62             :  *
      63             :  * pa_lstart -> the logical start block for this prealloc space
      64             :  * pa_pstart -> the physical start block for this prealloc space
      65             :  * pa_len    -> length for this prealloc space (in clusters)
      66             :  * pa_free   ->  free space available in this prealloc space (in clusters)
      67             :  *
      68             :  * The inode preallocation space is used looking at the _logical_ start
      69             :  * block. If only the logical file block falls within the range of prealloc
      70             :  * space we will consume the particular prealloc space. This makes sure that
      71             :  * we have contiguous physical blocks representing the file blocks
      72             :  *
      73             :  * The important thing to be noted in case of inode prealloc space is that
      74             :  * we don't modify the values associated to inode prealloc space except
      75             :  * pa_free.
      76             :  *
      77             :  * If we are not able to find blocks in the inode prealloc space and if we
      78             :  * have the group allocation flag set then we look at the locality group
      79             :  * prealloc space. These are per CPU prealloc list represented as
      80             :  *
      81             :  * ext4_sb_info.s_locality_groups[smp_processor_id()]
      82             :  *
      83             :  * The reason for having a per cpu locality group is to reduce the contention
      84             :  * between CPUs. It is possible to get scheduled at this point.
      85             :  *
      86             :  * The locality group prealloc space is used looking at whether we have
      87             :  * enough free space (pa_free) within the prealloc space.
      88             :  *
      89             :  * If we can't allocate blocks via inode prealloc or/and locality group
      90             :  * prealloc then we look at the buddy cache. The buddy cache is represented
      91             :  * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
      92             :  * mapped to the buddy and bitmap information regarding different
      93             :  * groups. The buddy information is attached to buddy cache inode so that
      94             :  * we can access them through the page cache. The information regarding
      95             :  * each group is loaded via ext4_mb_load_buddy.  The information involves
      96             :  * the block bitmap and buddy information. The information is stored in the
      97             :  * inode as:
      98             :  *
      99             :  *  {                        page                        }
     100             :  *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
     101             :  *
     102             :  *
     103             :  * one block each for bitmap and buddy information.  So for each group we
     104             :  * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
     105             :  * blocksize) blocks.  So it can have information regarding groups_per_page
     106             :  * which is blocks_per_page/2
     107             :  *
     108             :  * The buddy cache inode is not stored on disk. The inode is thrown
     109             :  * away when the filesystem is unmounted.
     110             :  *
     111             :  * We look for count number of blocks in the buddy cache. If we were able
     112             :  * to locate that many free blocks we return with additional information
     113             :  * regarding rest of the contiguous physical block available
     114             :  *
     115             :  * Before allocating blocks via buddy cache we normalize the request
     116             :  * blocks. This ensures we ask for more blocks than we needed. The extra
     117             :  * blocks that we get after allocation are added to the respective prealloc
     118             :  * list. In case of inode preallocation we follow a list of heuristics
     119             :  * based on file size. This can be found in ext4_mb_normalize_request. If
     120             :  * we are doing a group prealloc we try to normalize the request to
     121             :  * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
     122             :  * dependent on the cluster size; for non-bigalloc file systems, it is
     123             :  * 512 blocks. This can be tuned via
     124             :  * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
     125             :  * terms of number of blocks. If we have mounted the file system with -o
     126             :  * stripe=<value> option the group prealloc request is normalized to the
     127             :  * smallest multiple of the stripe value (sbi->s_stripe) which is
     128             :  * greater than the default mb_group_prealloc.
     129             :  *
     130             :  * The regular allocator (using the buddy cache) supports a few tunables.
     131             :  *
     132             :  * /sys/fs/ext4/<partition>/mb_min_to_scan
     133             :  * /sys/fs/ext4/<partition>/mb_max_to_scan
     134             :  * /sys/fs/ext4/<partition>/mb_order2_req
     135             :  *
     136             :  * The regular allocator uses buddy scan only if the request len is power of
     137             :  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
     138             :  * value of s_mb_order2_reqs can be tuned via
     139             :  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
     140             :  * stripe size (sbi->s_stripe), we try to search for contiguous block in
     141             :  * stripe size. This should result in better allocation on RAID setups. If
     142             :  * not, we search in the specific group using bitmap for best extents. The
     143             :  * tunable min_to_scan and max_to_scan control the behaviour here.
     144             :  * min_to_scan indicates how long the mballoc __must__ look for a best
     145             :  * extent and max_to_scan indicates how long the mballoc __can__ look for a
     146             :  * best extent in the found extents. Searching for the blocks starts with
     147             :  * the group specified as the goal value in allocation context via
     148             :  * ac_g_ex. Each group is first checked based on the criteria whether it
     149             :  * can be used for allocation. ext4_mb_good_group explains how the groups are
     150             :  * checked.
     151             :  *
     152             :  * Both prealloc spaces are populated as above. So for the first
     153             :  * request we will hit the buddy cache which will result in this prealloc
     154             :  * space getting filled. The prealloc space is then later used for the
     155             :  * subsequent request.
     156             :  */
     157             : 
     158             : /*
     159             :  * mballoc operates on the following data:
     160             :  *  - on-disk bitmap
     161             :  *  - in-core buddy (actually includes buddy and bitmap)
     162             :  *  - preallocation descriptors (PAs)
     163             :  *
     164             :  * there are two types of preallocations:
     165             :  *  - inode
     166             :  *    assigned to specific inode and can be used for this inode only.
     167             :  *    it describes part of inode's space preallocated to specific
     168             :  *    physical blocks. any block from that preallocation can be used
     169             :  *    independently. the descriptor just tracks number of blocks left
     170             :  *    unused. so, before taking some block from descriptor, one must
     171             :  *    make sure corresponding logical block isn't allocated yet. this
     172             :  *    also means that freeing any block within descriptor's range
     173             :  *    must discard all preallocated blocks.
     174             :  *  - locality group
     175             :  *    assigned to specific locality group which does not translate to
     176             :  *    permanent set of inodes: inode can join and leave group. space
     177             :  *    from this type of preallocation can be used for any inode. thus
     178             :  *    it's consumed from the beginning to the end.
     179             :  *
     180             :  * relation between them can be expressed as:
     181             :  *    in-core buddy = on-disk bitmap + preallocation descriptors
     182             :  *
     183             :  * this means the blocks mballoc considers used are:
     184             :  *  - allocated blocks (persistent)
     185             :  *  - preallocated blocks (non-persistent)
     186             :  *
     187             :  * consistency in mballoc world means that at any time a block is either
     188             :  * free or used in ALL structures. notice: "any time" should not be read
     189             :  * literally -- time is discrete and delimited by locks.
     190             :  *
     191             :  *  to keep it simple, we don't use block numbers, instead we count number of
     192             :  *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
     193             :  *
     194             :  * all operations can be expressed as:
     195             :  *  - init buddy:                       buddy = on-disk + PAs
     196             :  *  - new PA:                           buddy += N; PA = N
     197             :  *  - use inode PA:                     on-disk += N; PA -= N
     198             :  *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
     199             :  *  - use locality group PA             on-disk += N; PA -= N
     200             :  *  - discard locality group PA         buddy -= PA; PA = 0
     201             :  *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
     202             :  *        is used in real operation because we can't know actual used
     203             :  *        bits from PA, only from on-disk bitmap
     204             :  *
     205             :  * if we follow this strict logic, then all operations above should be atomic.
     206             :  * given some of them can block, we'd have to use something like semaphores
     207             :  * killing performance on high-end SMP hardware. let's try to relax it using
     208             :  * the following knowledge:
     209             :  *  1) if buddy is referenced, it's already initialized
     210             :  *  2) while block is used in buddy and the buddy is referenced,
     211             :  *     nobody can re-allocate that block
     212             :  *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
     213             :  *     bit set and PA claims same block, it's OK. IOW, one can set bit in
     214             :  *     on-disk bitmap if buddy has same bit set or/and PA covers corresponding
     215             :  *     block
     216             :  *
     217             :  * so, now we're building a concurrency table:
     218             :  *  - init buddy vs.
     219             :  *    - new PA
     220             :  *      blocks for PA are allocated in the buddy, buddy must be referenced
     221             :  *      until PA is linked to allocation group to avoid concurrent buddy init
     222             :  *    - use inode PA
     223             :  *      we need to make sure that either on-disk bitmap or PA has uptodate data
     224             :  *      given (3) we care that PA-=N operation doesn't interfere with init
     225             :  *    - discard inode PA
     226             :  *      the simplest way would be to have buddy initialized by the discard
     227             :  *    - use locality group PA
     228             :  *      again PA-=N must be serialized with init
     229             :  *    - discard locality group PA
     230             :  *      the simplest way would be to have buddy initialized by the discard
     231             :  *  - new PA vs.
     232             :  *    - use inode PA
     233             :  *      i_data_sem serializes them
     234             :  *    - discard inode PA
     235             :  *      discard process must wait until PA isn't used by another process
     236             :  *    - use locality group PA
     237             :  *      some mutex should serialize them
     238             :  *    - discard locality group PA
     239             :  *      discard process must wait until PA isn't used by another process
     240             :  *  - use inode PA
     241             :  *    - use inode PA
     242             :  *      i_data_sem or another mutex should serialize them
     243             :  *    - discard inode PA
     244             :  *      discard process must wait until PA isn't used by another process
     245             :  *    - use locality group PA
     246             :  *      nothing wrong here -- they're different PAs covering different blocks
     247             :  *    - discard locality group PA
     248             :  *      discard process must wait until PA isn't used by another process
     249             :  *
     250             :  * now we're ready to draw a few conclusions:
     251             :  *  - while PA is referenced, no discard is possible
     252             :  *  - PA is referenced until block isn't marked in on-disk bitmap
     253             :  *  - PA changes only after on-disk bitmap
     254             :  *  - discard must not compete with init. either init is done before
     255             :  *    any discard or they're serialized somehow
     256             :  *  - buddy init as sum of on-disk bitmap and PAs is done atomically
     257             :  *
     258             :  * a special case is when we've used PA to emptiness. no need to modify buddy
     259             :  * in this case, but we should care about concurrent init
     260             :  *
     261             :  */
     262             : 
     263             :  /*
     264             :  * Logic in few words:
     265             :  *
     266             :  *  - allocation:
     267             :  *    load group
     268             :  *    find blocks
     269             :  *    mark bits in on-disk bitmap
     270             :  *    release group
     271             :  *
     272             :  *  - use preallocation:
     273             :  *    find proper PA (per-inode or group)
     274             :  *    load group
     275             :  *    mark bits in on-disk bitmap
     276             :  *    release group
     277             :  *    release PA
     278             :  *
     279             :  *  - free:
     280             :  *    load group
     281             :  *    mark bits in on-disk bitmap
     282             :  *    release group
     283             :  *
     284             :  *  - discard preallocations in group:
     285             :  *    mark PAs deleted
     286             :  *    move them onto local list
     287             :  *    load on-disk bitmap
     288             :  *    load group
     289             :  *    remove PA from object (inode or locality group)
     290             :  *    mark free blocks in-core
     291             :  *
     292             :  *  - discard inode's preallocations:
     293             :  */
     294             : 
     295             : /*
     296             :  * Locking rules
     297             :  *
     298             :  * Locks:
     299             :  *  - bitlock on a group        (group)
     300             :  *  - object (inode/locality)   (object)
     301             :  *  - per-pa lock               (pa)
     302             :  *
     303             :  * Paths:
     304             :  *  - new pa
     305             :  *    object
     306             :  *    group
     307             :  *
     308             :  *  - find and use pa:
     309             :  *    pa
     310             :  *
     311             :  *  - release consumed pa:
     312             :  *    pa
     313             :  *    group
     314             :  *    object
     315             :  *
     316             :  *  - generate in-core bitmap:
     317             :  *    group
     318             :  *        pa
     319             :  *
     320             :  *  - discard all for given object (inode, locality group):
     321             :  *    object
     322             :  *        pa
     323             :  *    group
     324             :  *
     325             :  *  - discard all for given group:
     326             :  *    group
     327             :  *        pa
     328             :  *    group
     329             :  *        object
     330             :  *
     331             :  */
     332             : static struct kmem_cache *ext4_pspace_cachep;
     333             : static struct kmem_cache *ext4_ac_cachep;
     334             : static struct kmem_cache *ext4_free_data_cachep;
     335             : 
     336             : /* We create slab caches for groupinfo data structures based on the
     337             :  * superblock block size.  There will be one per mounted filesystem for
     338             :  * each unique s_blocksize_bits */
     339             : #define NR_GRPINFO_CACHES 8
     340             : static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
     341             : 
     342             : static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
     343             :         "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
     344             :         "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
     345             :         "ext4_groupinfo_64k", "ext4_groupinfo_128k"
     346             : };
     347             : 
     348             : static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
     349             :                                         ext4_group_t group);
     350             : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
     351             :                                                 ext4_group_t group);
     352             : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
     353             : 
     354             : /*
     355             :  * The algorithm using this percpu seq counter works as follows:
     356             :  * 1. We sample the percpu discard_pa_seq counter before trying for block
     357             :  *    allocation in ext4_mb_new_blocks().
     358             :  * 2. We increment this percpu discard_pa_seq counter when we either allocate
     359             :  *    or free these blocks i.e. while marking those blocks as used/free in
     360             :  *    mb_mark_used()/mb_free_blocks().
     361             :  * 3. We also increment this percpu seq counter when we successfully identify
     362             :  *    that the bb_prealloc_list is not empty and hence proceed for discarding
     363             :  *    of those PAs inside ext4_mb_discard_group_preallocations().
     364             :  *
     365             :  * Now to make sure that the regular fast path of block allocation is not
     366             :  * affected, as a small optimization we only sample the percpu seq counter
     367             :  * on that cpu. Only when the block allocation fails and when freed blocks
     368             :  * found were 0, that is when we sample percpu seq counter for all cpus using
     369             :  * below function ext4_get_discard_pa_seq_sum(). This happens after making
     370             :  * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
     371             :  */
     372             : static DEFINE_PER_CPU(u64, discard_pa_seq);
     373           0 : static inline u64 ext4_get_discard_pa_seq_sum(void)
     374             : {
     375           0 :         int __cpu;
     376           0 :         u64 __seq = 0;
     377             : 
     378           0 :         for_each_possible_cpu(__cpu)
     379           0 :                 __seq += per_cpu(discard_pa_seq, __cpu);
     380           0 :         return __seq;
     381             : }
     382             : 
     383       57552 : static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
     384             : {
     385             : #if BITS_PER_LONG == 64
     386       57552 :         *bit += ((unsigned long) addr & 7UL) << 3;
     387       57552 :         addr = (void *) ((unsigned long) addr & ~7UL);
     388             : #elif BITS_PER_LONG == 32
     389             :         *bit += ((unsigned long) addr & 3UL) << 3;
     390             :         addr = (void *) ((unsigned long) addr & ~3UL);
     391             : #else
     392             : #error "how many bits you are?!"
     393             : #endif
     394       57552 :         return addr;
     395             : }
     396             : 
     397       34612 : static inline int mb_test_bit(int bit, void *addr)
     398             : {
     399             :         /*
     400             :          * ext4_test_bit on architectures like powerpc
     401             :          * needs an unsigned long aligned address
     402             :          */
     403       34612 :         addr = mb_correct_addr_and_bit(&bit, addr);
     404       34612 :         return ext4_test_bit(bit, addr);
     405             : }
     406             : 
     407        1149 : static inline void mb_set_bit(int bit, void *addr)
     408             : {
     409        1149 :         addr = mb_correct_addr_and_bit(&bit, addr);
     410        1149 :         ext4_set_bit(bit, addr);
     411        1149 : }
     412             : 
     413        8965 : static inline void mb_clear_bit(int bit, void *addr)
     414             : {
     415        8965 :         addr = mb_correct_addr_and_bit(&bit, addr);
     416        8965 :         ext4_clear_bit(bit, addr);
     417        8965 : }
     418             : 
     419         830 : static inline int mb_test_and_clear_bit(int bit, void *addr)
     420             : {
     421         830 :         addr = mb_correct_addr_and_bit(&bit, addr);
     422         830 :         return ext4_test_and_clear_bit(bit, addr);
     423             : }
     424             : 
     425        7500 : static inline int mb_find_next_zero_bit(void *addr, int max, int start)
     426             : {
     427        7500 :         int fix = 0, ret, tmpmax;
     428        7500 :         addr = mb_correct_addr_and_bit(&fix, addr);
     429        7500 :         tmpmax = max + fix;
     430        7500 :         start += fix;
     431             : 
     432        7500 :         ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
     433        7500 :         if (ret > max)
     434             :                 return max;
     435             :         return ret;
     436             : }
     437             : 
     438        4496 : static inline int mb_find_next_bit(void *addr, int max, int start)
     439             : {
     440        4496 :         int fix = 0, ret, tmpmax;
     441        4496 :         addr = mb_correct_addr_and_bit(&fix, addr);
     442        4496 :         tmpmax = max + fix;
     443        4496 :         start += fix;
     444             : 
     445        4496 :         ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
     446        4496 :         if (ret > max)
     447             :                 return max;
     448             :         return ret;
     449             : }
     450             : 
     451       34354 : static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
     452             : {
     453       34354 :         char *bb;
     454             : 
     455       34354 :         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
     456       34354 :         BUG_ON(max == NULL);
     457             : 
     458       34354 :         if (order > e4b->bd_blkbits + 1) {
     459           0 :                 *max = 0;
     460           0 :                 return NULL;
     461             :         }
     462             : 
     463             :         /* at order 0 we see each particular block */
     464       34354 :         if (order == 0) {
     465        4866 :                 *max = 1 << (e4b->bd_blkbits + 3);
     466        4866 :                 return e4b->bd_bitmap;
     467             :         }
     468             : 
     469       29488 :         bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
     470       29488 :         *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
     471             : 
     472       29488 :         return bb;
     473             : }
     474             : 
     475             : #ifdef DOUBLE_CHECK
     476             : static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
     477             :                            int first, int count)
     478             : {
     479             :         int i;
     480             :         struct super_block *sb = e4b->bd_sb;
     481             : 
     482             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     483             :                 return;
     484             :         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
     485             :         for (i = 0; i < count; i++) {
     486             :                 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
     487             :                         ext4_fsblk_t blocknr;
     488             : 
     489             :                         blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
     490             :                         blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
     491             :                         ext4_grp_locked_error(sb, e4b->bd_group,
     492             :                                               inode ? inode->i_ino : 0,
     493             :                                               blocknr,
     494             :                                               "freeing block already freed "
     495             :                                               "(bit %u)",
     496             :                                               first + i);
     497             :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
     498             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
     499             :                 }
     500             :                 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
     501             :         }
     502             : }
     503             : 
     504             : static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
     505             : {
     506             :         int i;
     507             : 
     508             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     509             :                 return;
     510             :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
     511             :         for (i = 0; i < count; i++) {
     512             :                 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
     513             :                 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
     514             :         }
     515             : }
     516             : 
     517             : static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
     518             : {
     519             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     520             :                 return;
     521             :         if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
     522             :                 unsigned char *b1, *b2;
     523             :                 int i;
     524             :                 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
     525             :                 b2 = (unsigned char *) bitmap;
     526             :                 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
     527             :                         if (b1[i] != b2[i]) {
     528             :                                 ext4_msg(e4b->bd_sb, KERN_ERR,
     529             :                                          "corruption in group %u "
     530             :                                          "at byte %u(%u): %x in copy != %x "
     531             :                                          "on disk/prealloc",
     532             :                                          e4b->bd_group, i, i * 8, b1[i], b2[i]);
     533             :                                 BUG();
     534             :                         }
     535             :                 }
     536             :         }
     537             : }
     538             : 
     539             : static void mb_group_bb_bitmap_alloc(struct super_block *sb,
     540             :                         struct ext4_group_info *grp, ext4_group_t group)
     541             : {
     542             :         struct buffer_head *bh;
     543             : 
     544             :         grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
     545             :         if (!grp->bb_bitmap)
     546             :                 return;
     547             : 
     548             :         bh = ext4_read_block_bitmap(sb, group);
     549             :         if (IS_ERR_OR_NULL(bh)) {
     550             :                 kfree(grp->bb_bitmap);
     551             :                 grp->bb_bitmap = NULL;
     552             :                 return;
     553             :         }
     554             : 
     555             :         memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
     556             :         put_bh(bh);
     557             : }
     558             : 
     559             : static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
     560             : {
     561             :         kfree(grp->bb_bitmap);
     562             : }
     563             : 
     564             : #else
     565         166 : static inline void mb_free_blocks_double(struct inode *inode,
     566             :                                 struct ext4_buddy *e4b, int first, int count)
     567             : {
     568         166 :         return;
     569             : }
     570         197 : static inline void mb_mark_used_double(struct ext4_buddy *e4b,
     571             :                                                 int first, int count)
     572             : {
     573         197 :         return;
     574             : }
     575             : static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
     576             : {
     577             :         return;
     578             : }
     579             : 
     580          16 : static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
     581             :                         struct ext4_group_info *grp, ext4_group_t group)
     582             : {
     583          16 :         return;
     584             : }
     585             : 
     586           0 : static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
     587             : {
     588           0 :         return;
     589             : }
     590             : #endif
     591             : 
     592             : #ifdef AGGRESSIVE_CHECK
     593             : 
     594             : #define MB_CHECK_ASSERT(assert)                                         \
     595             : do {                                                                    \
     596             :         if (!(assert)) {                                                \
     597             :                 printk(KERN_EMERG                                       \
     598             :                         "Assertion failure in %s() at %s:%d: \"%s\"\n",     \
     599             :                         function, file, line, # assert);                \
     600             :                 BUG();                                                  \
     601             :         }                                                               \
     602             : } while (0)
     603             : 
     604             : static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
     605             :                                 const char *function, int line)
     606             : {
     607             :         struct super_block *sb = e4b->bd_sb;
     608             :         int order = e4b->bd_blkbits + 1;
     609             :         int max;
     610             :         int max2;
     611             :         int i;
     612             :         int j;
     613             :         int k;
     614             :         int count;
     615             :         struct ext4_group_info *grp;
     616             :         int fragments = 0;
     617             :         int fstart;
     618             :         struct list_head *cur;
     619             :         void *buddy;
     620             :         void *buddy2;
     621             : 
     622             :         if (e4b->bd_info->bb_check_counter++ % 10)
     623             :                 return 0;
     624             : 
     625             :         while (order > 1) {
     626             :                 buddy = mb_find_buddy(e4b, order, &max);
     627             :                 MB_CHECK_ASSERT(buddy);
     628             :                 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
     629             :                 MB_CHECK_ASSERT(buddy2);
     630             :                 MB_CHECK_ASSERT(buddy != buddy2);
     631             :                 MB_CHECK_ASSERT(max * 2 == max2);
     632             : 
     633             :                 count = 0;
     634             :                 for (i = 0; i < max; i++) {
     635             : 
     636             :                         if (mb_test_bit(i, buddy)) {
     637             :                                 /* only single bit in buddy2 may be 1 */
     638             :                                 if (!mb_test_bit(i << 1, buddy2)) {
     639             :                                         MB_CHECK_ASSERT(
     640             :                                                 mb_test_bit((i<<1)+1, buddy2));
     641             :                                 } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
     642             :                                         MB_CHECK_ASSERT(
     643             :                                                 mb_test_bit(i << 1, buddy2));
     644             :                                 }
     645             :                                 continue;
     646             :                         }
     647             : 
     648             :                         /* both bits in buddy2 must be 1 */
     649             :                         MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
     650             :                         MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
     651             : 
     652             :                         for (j = 0; j < (1 << order); j++) {
     653             :                                 k = (i * (1 << order)) + j;
     654             :                                 MB_CHECK_ASSERT(
     655             :                                         !mb_test_bit(k, e4b->bd_bitmap));
     656             :                         }
     657             :                         count++;
     658             :                 }
     659             :                 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
     660             :                 order--;
     661             :         }
     662             : 
     663             :         fstart = -1;
     664             :         buddy = mb_find_buddy(e4b, 0, &max);
     665             :         for (i = 0; i < max; i++) {
     666             :                 if (!mb_test_bit(i, buddy)) {
     667             :                         MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
     668             :                         if (fstart == -1) {
     669             :                                 fragments++;
     670             :                                 fstart = i;
     671             :                         }
     672             :                         continue;
     673             :                 }
     674             :                 fstart = -1;
     675             :                 /* check used bits only */
     676             :                 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
     677             :                         buddy2 = mb_find_buddy(e4b, j, &max2);
     678             :                         k = i >> j;
     679             :                         MB_CHECK_ASSERT(k < max2);
     680             :                         MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
     681             :                 }
     682             :         }
     683             :         MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
     684             :         MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
     685             : 
     686             :         grp = ext4_get_group_info(sb, e4b->bd_group);
     687             :         list_for_each(cur, &grp->bb_prealloc_list) {
     688             :                 ext4_group_t groupnr;
     689             :                 struct ext4_prealloc_space *pa;
     690             :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
     691             :                 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
     692             :                 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
     693             :                 for (i = 0; i < pa->pa_len; i++)
     694             :                         MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
     695             :         }
     696             :         return 0;
     697             : }
     698             : #undef MB_CHECK_ASSERT
     699             : #define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
     700             :                                         __FILE__, __func__, __LINE__)
     701             : #else
     702             : #define mb_check_buddy(e4b)
     703             : #endif
     704             : 
     705             : /*
     706             :  * Divide blocks started from @first with length @len into
     707             :  * smaller chunks with power of 2 blocks.
     708             :  * Clear the bits in bitmap which the blocks of the chunk(s) covered,
     709             :  * then increase bb_counters[] for corresponded chunk size.
     710             :  */
     711        3362 : static void ext4_mb_mark_free_simple(struct super_block *sb,
     712             :                                 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
     713             :                                         struct ext4_group_info *grp)
     714             : {
     715        3362 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     716        3362 :         ext4_grpblk_t min;
     717        3362 :         ext4_grpblk_t max;
     718        3362 :         ext4_grpblk_t chunk;
     719        3362 :         unsigned int border;
     720             : 
     721        3362 :         BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
     722             : 
     723        3362 :         border = 2 << sb->s_blocksize_bits;
     724             : 
     725       13508 :         while (len > 0) {
     726             :                 /* find how many blocks can be covered since this position */
     727       10146 :                 max = ffs(first | border) - 1;
     728             : 
     729             :                 /* find how many blocks of power 2 we need to mark */
     730       10146 :                 min = fls(len) - 1;
     731             : 
     732       10146 :                 if (max < min)
     733             :                         min = max;
     734       10146 :                 chunk = 1 << min;
     735             : 
     736             :                 /* mark multiblock chunks only */
     737       10146 :                 grp->bb_counters[min]++;
     738       10146 :                 if (min > 0)
     739        7829 :                         mb_clear_bit(first >> min,
     740        7829 :                                      buddy + sbi->s_mb_offsets[min]);
     741             : 
     742       10146 :                 len -= chunk;
     743       10146 :                 first += chunk;
     744             :         }
     745        3362 : }
     746             : 
     747             : /*
     748             :  * Cache the order of the largest free extent we have available in this block
     749             :  * group.
     750             :  */
     751             : static void
     752         379 : mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
     753             : {
     754         379 :         int i;
     755         379 :         int bits;
     756             : 
     757         379 :         grp->bb_largest_free_order = -1; /* uninit */
     758             : 
     759         379 :         bits = sb->s_blocksize_bits + 1;
     760        2954 :         for (i = bits; i >= 0; i--) {
     761        2954 :                 if (grp->bb_counters[i] > 0) {
     762         379 :                         grp->bb_largest_free_order = i;
     763         379 :                         break;
     764             :                 }
     765             :         }
     766             : }
     767             : 
     768             : static noinline_for_stack
     769          16 : void ext4_mb_generate_buddy(struct super_block *sb,
     770             :                                 void *buddy, void *bitmap, ext4_group_t group)
     771             : {
     772          16 :         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
     773          16 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     774          16 :         ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
     775          16 :         ext4_grpblk_t i = 0;
     776          16 :         ext4_grpblk_t first;
     777          16 :         ext4_grpblk_t len;
     778          16 :         unsigned free = 0;
     779          16 :         unsigned fragments = 0;
     780          16 :         unsigned long long period = get_cycles();
     781             : 
     782             :         /* initialize buddy from bitmap which is aggregation
     783             :          * of on-disk bitmap and preallocations */
     784          16 :         i = mb_find_next_zero_bit(bitmap, max, 0);
     785          16 :         grp->bb_first_free = i;
     786        4510 :         while (i < max) {
     787        4494 :                 fragments++;
     788        4494 :                 first = i;
     789        4494 :                 i = mb_find_next_bit(bitmap, max, i);
     790        4494 :                 len = i - first;
     791        4494 :                 free += len;
     792        4494 :                 if (len > 1)
     793        3362 :                         ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
     794             :                 else
     795        1132 :                         grp->bb_counters[0]++;
     796        4494 :                 if (i < max)
     797        4486 :                         i = mb_find_next_zero_bit(bitmap, max, i);
     798             :         }
     799          16 :         grp->bb_fragments = fragments;
     800             : 
     801          16 :         if (free != grp->bb_free) {
     802           0 :                 ext4_grp_locked_error(sb, group, 0, 0,
     803             :                                       "block bitmap and bg descriptor "
     804             :                                       "inconsistent: %u vs %u free clusters",
     805             :                                       free, grp->bb_free);
     806             :                 /*
     807             :                  * If we intend to continue, we consider group descriptor
     808             :                  * corrupt and update bb_free using bitmap value
     809             :                  */
     810           0 :                 grp->bb_free = free;
     811           0 :                 ext4_mark_group_bitmap_corrupted(sb, group,
     812             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
     813             :         }
     814          16 :         mb_set_largest_free_order(sb, grp);
     815             : 
     816          16 :         clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
     817             : 
     818          16 :         period = get_cycles() - period;
     819          16 :         spin_lock(&sbi->s_bal_lock);
     820          16 :         sbi->s_mb_buddies_generated++;
     821          16 :         sbi->s_mb_generation_time += period;
     822          16 :         spin_unlock(&sbi->s_bal_lock);
     823          16 : }
     824             : 
     825             : /* The buddy information is attached the buddy cache inode
     826             :  * for convenience. The information regarding each group
     827             :  * is loaded via ext4_mb_load_buddy. The information involve
     828             :  * block bitmap and buddy information. The information are
     829             :  * stored in the inode as
     830             :  *
     831             :  * {                        page                        }
     832             :  * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
     833             :  *
     834             :  *
     835             :  * one block each for bitmap and buddy information.
     836             :  * So for each group we take up 2 blocks. A page can
     837             :  * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
     838             :  * So it can have information regarding groups_per_page which
     839             :  * is blocks_per_page/2
     840             :  *
     841             :  * Locking note:  This routine takes the block group lock of all groups
     842             :  * for this page; do not hold this lock when calling this routine!
     843             :  */
     844             : 
     845          32 : static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
     846             : {
     847          32 :         ext4_group_t ngroups;
     848          32 :         int blocksize;
     849          32 :         int blocks_per_page;
     850          32 :         int groups_per_page;
     851          32 :         int err = 0;
     852          32 :         int i;
     853          32 :         ext4_group_t first_group, group;
     854          32 :         int first_block;
     855          32 :         struct super_block *sb;
     856          32 :         struct buffer_head *bhs;
     857          32 :         struct buffer_head **bh = NULL;
     858          32 :         struct inode *inode;
     859          32 :         char *data;
     860          32 :         char *bitmap;
     861          32 :         struct ext4_group_info *grinfo;
     862             : 
     863          32 :         inode = page->mapping->host;
     864          32 :         sb = inode->i_sb;
     865          32 :         ngroups = ext4_get_groups_count(sb);
     866          32 :         blocksize = i_blocksize(inode);
     867          32 :         blocks_per_page = PAGE_SIZE / blocksize;
     868             : 
     869          32 :         mb_debug(sb, "init page %lu\n", page->index);
     870             : 
     871          32 :         groups_per_page = blocks_per_page >> 1;
     872          32 :         if (groups_per_page == 0)
     873             :                 groups_per_page = 1;
     874             : 
     875             :         /* allocate buffer_heads to read bitmaps */
     876           0 :         if (groups_per_page > 1) {
     877           0 :                 i = sizeof(struct buffer_head *) * groups_per_page;
     878           0 :                 bh = kzalloc(i, gfp);
     879           0 :                 if (bh == NULL) {
     880           0 :                         err = -ENOMEM;
     881           0 :                         goto out;
     882             :                 }
     883             :         } else
     884             :                 bh = &bhs;
     885             : 
     886          32 :         first_group = page->index * blocks_per_page / 2;
     887             : 
     888             :         /* read all groups the page covers into the cache */
     889          64 :         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
     890          32 :                 if (group >= ngroups)
     891             :                         break;
     892             : 
     893          32 :                 grinfo = ext4_get_group_info(sb, group);
     894             :                 /*
     895             :                  * If page is uptodate then we came here after online resize
     896             :                  * which added some new uninitialized group info structs, so
     897             :                  * we must skip all initialized uptodate buddies on the page,
     898             :                  * which may be currently in use by an allocating task.
     899             :                  */
     900          32 :                 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
     901           0 :                         bh[i] = NULL;
     902           0 :                         continue;
     903             :                 }
     904          32 :                 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
     905          32 :                 if (IS_ERR(bh[i])) {
     906           0 :                         err = PTR_ERR(bh[i]);
     907           0 :                         bh[i] = NULL;
     908           0 :                         goto out;
     909             :                 }
     910             :                 mb_debug(sb, "read bitmap for group %u\n", group);
     911             :         }
     912             : 
     913             :         /* wait for I/O completion */
     914          64 :         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
     915          32 :                 int err2;
     916             : 
     917          32 :                 if (!bh[i])
     918           0 :                         continue;
     919          32 :                 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
     920          32 :                 if (!err)
     921          32 :                         err = err2;
     922             :         }
     923             : 
     924          32 :         first_block = page->index * blocks_per_page;
     925          64 :         for (i = 0; i < blocks_per_page; i++) {
     926          32 :                 group = (first_block + i) >> 1;
     927          32 :                 if (group >= ngroups)
     928             :                         break;
     929             : 
     930          32 :                 if (!bh[group - first_group])
     931             :                         /* skip initialized uptodate buddy */
     932           0 :                         continue;
     933             : 
     934          32 :                 if (!buffer_verified(bh[group - first_group]))
     935             :                         /* Skip faulty bitmaps */
     936           0 :                         continue;
     937          32 :                 err = 0;
     938             : 
     939             :                 /*
     940             :                  * data carry information regarding this
     941             :                  * particular group in the format specified
     942             :                  * above
     943             :                  *
     944             :                  */
     945          32 :                 data = page_address(page) + (i * blocksize);
     946          32 :                 bitmap = bh[group - first_group]->b_data;
     947             : 
     948             :                 /*
     949             :                  * We place the buddy block and bitmap block
     950             :                  * close together
     951             :                  */
     952          32 :                 if ((first_block + i) & 1) {
     953             :                         /* this is block of buddy */
     954          16 :                         BUG_ON(incore == NULL);
     955          16 :                         mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
     956             :                                 group, page->index, i * blocksize);
     957          16 :                         trace_ext4_mb_buddy_bitmap_load(sb, group);
     958          16 :                         grinfo = ext4_get_group_info(sb, group);
     959          16 :                         grinfo->bb_fragments = 0;
     960          16 :                         memset(grinfo->bb_counters, 0,
     961             :                                sizeof(*grinfo->bb_counters) *
     962          16 :                                 (sb->s_blocksize_bits+2));
     963             :                         /*
     964             :                          * incore got set to the group block bitmap below
     965             :                          */
     966          16 :                         ext4_lock_group(sb, group);
     967             :                         /* init the buddy */
     968          16 :                         memset(data, 0xff, blocksize);
     969          16 :                         ext4_mb_generate_buddy(sb, data, incore, group);
     970          16 :                         ext4_unlock_group(sb, group);
     971          16 :                         incore = NULL;
     972             :                 } else {
     973             :                         /* this is block of bitmap */
     974          16 :                         BUG_ON(incore != NULL);
     975          16 :                         mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
     976             :                                 group, page->index, i * blocksize);
     977          16 :                         trace_ext4_mb_bitmap_load(sb, group);
     978             : 
     979             :                         /* see comments in ext4_mb_put_pa() */
     980          16 :                         ext4_lock_group(sb, group);
     981          16 :                         memcpy(data, bitmap, blocksize);
     982             : 
     983             :                         /* mark all preallocated blks used in in-core bitmap */
     984          16 :                         ext4_mb_generate_from_pa(sb, data, group);
     985          16 :                         ext4_mb_generate_from_freelist(sb, data, group);
     986          16 :                         ext4_unlock_group(sb, group);
     987             : 
     988             :                         /* set incore so that the buddy information can be
     989             :                          * generated using this
     990             :                          */
     991          16 :                         incore = data;
     992             :                 }
     993             :         }
     994          32 :         SetPageUptodate(page);
     995             : 
     996          32 : out:
     997          32 :         if (bh) {
     998          64 :                 for (i = 0; i < groups_per_page; i++)
     999          64 :                         brelse(bh[i]);
    1000          32 :                 if (bh != &bhs)
    1001           0 :                         kfree(bh);
    1002             :         }
    1003          32 :         return err;
    1004             : }
    1005             : 
    1006             : /*
    1007             :  * Lock the buddy and bitmap pages. This make sure other parallel init_group
    1008             :  * on the same buddy page doesn't happen whild holding the buddy page lock.
    1009             :  * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
    1010             :  * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
    1011             :  */
    1012          16 : static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
    1013             :                 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
    1014             : {
    1015          16 :         struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
    1016          16 :         int block, pnum, poff;
    1017          16 :         int blocks_per_page;
    1018          16 :         struct page *page;
    1019             : 
    1020          16 :         e4b->bd_buddy_page = NULL;
    1021          16 :         e4b->bd_bitmap_page = NULL;
    1022             : 
    1023          16 :         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
    1024             :         /*
    1025             :          * the buddy cache inode stores the block bitmap
    1026             :          * and buddy information in consecutive blocks.
    1027             :          * So for each group we need two blocks.
    1028             :          */
    1029          16 :         block = group * 2;
    1030          16 :         pnum = block / blocks_per_page;
    1031          16 :         poff = block % blocks_per_page;
    1032          16 :         page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1033          16 :         if (!page)
    1034             :                 return -ENOMEM;
    1035          16 :         BUG_ON(page->mapping != inode->i_mapping);
    1036          16 :         e4b->bd_bitmap_page = page;
    1037          16 :         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
    1038             : 
    1039          16 :         if (blocks_per_page >= 2) {
    1040             :                 /* buddy and bitmap are on the same page */
    1041             :                 return 0;
    1042             :         }
    1043             : 
    1044          16 :         block++;
    1045          16 :         pnum = block / blocks_per_page;
    1046          16 :         page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1047          16 :         if (!page)
    1048             :                 return -ENOMEM;
    1049          16 :         BUG_ON(page->mapping != inode->i_mapping);
    1050          16 :         e4b->bd_buddy_page = page;
    1051          16 :         return 0;
    1052             : }
    1053             : 
    1054          16 : static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
    1055             : {
    1056          16 :         if (e4b->bd_bitmap_page) {
    1057          16 :                 unlock_page(e4b->bd_bitmap_page);
    1058          16 :                 put_page(e4b->bd_bitmap_page);
    1059             :         }
    1060          16 :         if (e4b->bd_buddy_page) {
    1061          16 :                 unlock_page(e4b->bd_buddy_page);
    1062          16 :                 put_page(e4b->bd_buddy_page);
    1063             :         }
    1064          16 : }
    1065             : 
    1066             : /*
    1067             :  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
    1068             :  * block group lock of all groups for this page; do not hold the BG lock when
    1069             :  * calling this routine!
    1070             :  */
    1071             : static noinline_for_stack
    1072          16 : int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
    1073             : {
    1074             : 
    1075          16 :         struct ext4_group_info *this_grp;
    1076          16 :         struct ext4_buddy e4b;
    1077          16 :         struct page *page;
    1078          16 :         int ret = 0;
    1079             : 
    1080          16 :         might_sleep();
    1081          16 :         mb_debug(sb, "init group %u\n", group);
    1082          16 :         this_grp = ext4_get_group_info(sb, group);
    1083             :         /*
    1084             :          * This ensures that we don't reinit the buddy cache
    1085             :          * page which maps to the group from which we are already
    1086             :          * allocating. If we are looking at the buddy cache we would
    1087             :          * have taken a reference using ext4_mb_load_buddy and that
    1088             :          * would have pinned buddy page to page cache.
    1089             :          * The call to ext4_mb_get_buddy_page_lock will mark the
    1090             :          * page accessed.
    1091             :          */
    1092          16 :         ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
    1093          16 :         if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
    1094             :                 /*
    1095             :                  * getting the page failed, or somebody else already
    1096             :                  * initialized the group: return without doing anything
    1097             :                  */
    1098           0 :                 goto err;
    1099             :         }
    1100             : 
    1101          16 :         page = e4b.bd_bitmap_page;
    1102          16 :         ret = ext4_mb_init_cache(page, NULL, gfp);
    1103          16 :         if (ret)
    1104           0 :                 goto err;
    1105          16 :         if (!PageUptodate(page)) {
    1106           0 :                 ret = -EIO;
    1107           0 :                 goto err;
    1108             :         }
    1109             : 
    1110          16 :         if (e4b.bd_buddy_page == NULL) {
    1111             :                 /*
    1112             :                  * If both the bitmap and buddy are in
    1113             :                  * the same page we don't need to force
    1114             :                  * init the buddy
    1115             :                  */
    1116           0 :                 ret = 0;
    1117           0 :                 goto err;
    1118             :         }
    1119             :         /* init buddy cache */
    1120          16 :         page = e4b.bd_buddy_page;
    1121          16 :         ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
    1122          16 :         if (ret)
    1123           0 :                 goto err;
    1124          16 :         if (!PageUptodate(page)) {
    1125           0 :                 ret = -EIO;
    1126           0 :                 goto err;
    1127             :         }
    1128          16 : err:
    1129          16 :         ext4_mb_put_buddy_page_lock(&e4b);
    1130          16 :         return ret;
    1131             : }
    1132             : 
    1133             : /*
    1134             :  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
    1135             :  * block group lock of all groups for this page; do not hold the BG lock when
    1136             :  * calling this routine!
    1137             :  */
    1138             : static noinline_for_stack int
    1139         585 : ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
    1140             :                        struct ext4_buddy *e4b, gfp_t gfp)
    1141             : {
    1142         585 :         int blocks_per_page;
    1143         585 :         int block;
    1144         585 :         int pnum;
    1145         585 :         int poff;
    1146         585 :         struct page *page;
    1147         585 :         int ret;
    1148         585 :         struct ext4_group_info *grp;
    1149         585 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1150         585 :         struct inode *inode = sbi->s_buddy_cache;
    1151             : 
    1152         585 :         might_sleep();
    1153         585 :         mb_debug(sb, "load group %u\n", group);
    1154             : 
    1155         585 :         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
    1156         585 :         grp = ext4_get_group_info(sb, group);
    1157             : 
    1158         585 :         e4b->bd_blkbits = sb->s_blocksize_bits;
    1159         585 :         e4b->bd_info = grp;
    1160         585 :         e4b->bd_sb = sb;
    1161         585 :         e4b->bd_group = group;
    1162         585 :         e4b->bd_buddy_page = NULL;
    1163         585 :         e4b->bd_bitmap_page = NULL;
    1164             : 
    1165         585 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    1166             :                 /*
    1167             :                  * we need full data about the group
    1168             :                  * to make a good selection
    1169             :                  */
    1170           1 :                 ret = ext4_mb_init_group(sb, group, gfp);
    1171           1 :                 if (ret)
    1172             :                         return ret;
    1173             :         }
    1174             : 
    1175             :         /*
    1176             :          * the buddy cache inode stores the block bitmap
    1177             :          * and buddy information in consecutive blocks.
    1178             :          * So for each group we need two blocks.
    1179             :          */
    1180         585 :         block = group * 2;
    1181         585 :         pnum = block / blocks_per_page;
    1182         585 :         poff = block % blocks_per_page;
    1183             : 
    1184             :         /* we could use find_or_create_page(), but it locks the page,
    1185             :          * which we'd like to avoid in the fast path ... */
    1186         585 :         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
    1187         585 :         if (page == NULL || !PageUptodate(page)) {
    1188           0 :                 if (page)
    1189             :                         /*
    1190             :                          * drop the page reference and try
    1191             :                          * to get the page with lock. If we
    1192             :                          * are not uptodate that implies
    1193             :                          * somebody just created the page but
    1194             :                          * is yet to initialize it. So
    1195             :                          * wait for it to be initialized.
    1196             :                          */
    1197           0 :                         put_page(page);
    1198           0 :                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1199           0 :                 if (page) {
    1200           0 :                         BUG_ON(page->mapping != inode->i_mapping);
    1201           0 :                         if (!PageUptodate(page)) {
    1202           0 :                                 ret = ext4_mb_init_cache(page, NULL, gfp);
    1203           0 :                                 if (ret) {
    1204           0 :                                         unlock_page(page);
    1205           0 :                                         goto err;
    1206             :                                 }
    1207           0 :                                 mb_cmp_bitmaps(e4b, page_address(page) +
    1208           0 :                                                (poff * sb->s_blocksize));
    1209             :                         }
    1210           0 :                         unlock_page(page);
    1211             :                 }
    1212             :         }
    1213         585 :         if (page == NULL) {
    1214           0 :                 ret = -ENOMEM;
    1215           0 :                 goto err;
    1216             :         }
    1217         585 :         if (!PageUptodate(page)) {
    1218           0 :                 ret = -EIO;
    1219           0 :                 goto err;
    1220             :         }
    1221             : 
    1222             :         /* Pages marked accessed already */
    1223         585 :         e4b->bd_bitmap_page = page;
    1224         585 :         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
    1225             : 
    1226         585 :         block++;
    1227         585 :         pnum = block / blocks_per_page;
    1228         585 :         poff = block % blocks_per_page;
    1229             : 
    1230         585 :         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
    1231         585 :         if (page == NULL || !PageUptodate(page)) {
    1232           0 :                 if (page)
    1233           0 :                         put_page(page);
    1234           0 :                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1235           0 :                 if (page) {
    1236           0 :                         BUG_ON(page->mapping != inode->i_mapping);
    1237           0 :                         if (!PageUptodate(page)) {
    1238           0 :                                 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
    1239             :                                                          gfp);
    1240           0 :                                 if (ret) {
    1241           0 :                                         unlock_page(page);
    1242           0 :                                         goto err;
    1243             :                                 }
    1244             :                         }
    1245           0 :                         unlock_page(page);
    1246             :                 }
    1247             :         }
    1248         585 :         if (page == NULL) {
    1249           0 :                 ret = -ENOMEM;
    1250           0 :                 goto err;
    1251             :         }
    1252         585 :         if (!PageUptodate(page)) {
    1253           0 :                 ret = -EIO;
    1254           0 :                 goto err;
    1255             :         }
    1256             : 
    1257             :         /* Pages marked accessed already */
    1258         585 :         e4b->bd_buddy_page = page;
    1259         585 :         e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
    1260             : 
    1261         585 :         return 0;
    1262             : 
    1263           0 : err:
    1264           0 :         if (page)
    1265           0 :                 put_page(page);
    1266           0 :         if (e4b->bd_bitmap_page)
    1267           0 :                 put_page(e4b->bd_bitmap_page);
    1268           0 :         if (e4b->bd_buddy_page)
    1269           0 :                 put_page(e4b->bd_buddy_page);
    1270           0 :         e4b->bd_buddy = NULL;
    1271           0 :         e4b->bd_bitmap = NULL;
    1272           0 :         return ret;
    1273             : }
    1274             : 
    1275         377 : static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
    1276             :                               struct ext4_buddy *e4b)
    1277             : {
    1278         377 :         return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
    1279             : }
    1280             : 
    1281         585 : static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
    1282             : {
    1283         585 :         if (e4b->bd_bitmap_page)
    1284         585 :                 put_page(e4b->bd_bitmap_page);
    1285         585 :         if (e4b->bd_buddy_page)
    1286         585 :                 put_page(e4b->bd_buddy_page);
    1287         585 : }
    1288             : 
    1289             : 
    1290        3640 : static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
    1291             : {
    1292        3640 :         int order = 1, max;
    1293        3640 :         void *bb;
    1294             : 
    1295        3640 :         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
    1296        3640 :         BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
    1297             : 
    1298       30546 :         while (order <= e4b->bd_blkbits + 1) {
    1299       28700 :                 bb = mb_find_buddy(e4b, order, &max);
    1300       28700 :                 if (!mb_test_bit(block >> order, bb)) {
    1301             :                         /* this block is part of buddy of order 'order' */
    1302        1794 :                         return order;
    1303             :                 }
    1304       26906 :                 order++;
    1305             :         }
    1306             :         return 0;
    1307             : }
    1308             : 
    1309         206 : static void mb_clear_bits(void *bm, int cur, int len)
    1310             : {
    1311         206 :         __u32 *addr;
    1312             : 
    1313         206 :         len = cur + len;
    1314        1013 :         while (cur < len) {
    1315         807 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1316             :                         /* fast path: clear whole word at once */
    1317          16 :                         addr = bm + (cur >> 3);
    1318          16 :                         *addr = 0;
    1319          16 :                         cur += 32;
    1320          16 :                         continue;
    1321             :                 }
    1322         791 :                 mb_clear_bit(cur, bm);
    1323         791 :                 cur++;
    1324             :         }
    1325         206 : }
    1326             : 
    1327             : /* clear bits in given range
    1328             :  * will return first found zero bit if any, -1 otherwise
    1329             :  */
    1330         166 : static int mb_test_and_clear_bits(void *bm, int cur, int len)
    1331             : {
    1332         166 :         __u32 *addr;
    1333         166 :         int zero_bit = -1;
    1334             : 
    1335         166 :         len = cur + len;
    1336        1012 :         while (cur < len) {
    1337         846 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1338             :                         /* fast path: clear whole word at once */
    1339          16 :                         addr = bm + (cur >> 3);
    1340          16 :                         if (*addr != (__u32)(-1) && zero_bit == -1)
    1341           0 :                                 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
    1342          16 :                         *addr = 0;
    1343          16 :                         cur += 32;
    1344          16 :                         continue;
    1345             :                 }
    1346         830 :                 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
    1347           0 :                         zero_bit = cur;
    1348         830 :                 cur++;
    1349             :         }
    1350             : 
    1351         166 :         return zero_bit;
    1352             : }
    1353             : 
    1354         442 : void ext4_set_bits(void *bm, int cur, int len)
    1355             : {
    1356         442 :         __u32 *addr;
    1357             : 
    1358         442 :         len = cur + len;
    1359        1270 :         while (cur < len) {
    1360         828 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1361             :                         /* fast path: set whole word at once */
    1362          69 :                         addr = bm + (cur >> 3);
    1363          69 :                         *addr = 0xffffffff;
    1364          69 :                         cur += 32;
    1365          69 :                         continue;
    1366             :                 }
    1367         759 :                 mb_set_bit(cur, bm);
    1368         759 :                 cur++;
    1369             :         }
    1370         442 : }
    1371             : 
    1372         237 : static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
    1373             : {
    1374         237 :         if (mb_test_bit(*bit + side, bitmap)) {
    1375         159 :                 mb_clear_bit(*bit, bitmap);
    1376         159 :                 (*bit) -= side;
    1377         159 :                 return 1;
    1378             :         }
    1379             :         else {
    1380          78 :                 (*bit) += side;
    1381          78 :                 mb_set_bit(*bit, bitmap);
    1382          78 :                 return -1;
    1383             :         }
    1384             : }
    1385             : 
    1386         104 : static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
    1387             : {
    1388         104 :         int max;
    1389         104 :         int order = 1;
    1390         104 :         void *buddy = mb_find_buddy(e4b, order, &max);
    1391             : 
    1392         293 :         while (buddy) {
    1393         293 :                 void *buddy2;
    1394             : 
    1395             :                 /* Bits in range [first; last] are known to be set since
    1396             :                  * corresponding blocks were allocated. Bits in range
    1397             :                  * (first; last) will stay set because they form buddies on
    1398             :                  * upper layer. We just deal with borders if they don't
    1399             :                  * align with upper layer and then go up.
    1400             :                  * Releasing entire group is all about clearing
    1401             :                  * single bit of highest order buddy.
    1402             :                  */
    1403             : 
    1404             :                 /* Example:
    1405             :                  * ---------------------------------
    1406             :                  * |   1   |   1   |   1   |   1   |
    1407             :                  * ---------------------------------
    1408             :                  * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
    1409             :                  * ---------------------------------
    1410             :                  *   0   1   2   3   4   5   6   7
    1411             :                  *      \_____________________/
    1412             :                  *
    1413             :                  * Neither [1] nor [6] is aligned to above layer.
    1414             :                  * Left neighbour [0] is free, so mark it busy,
    1415             :                  * decrease bb_counters and extend range to
    1416             :                  * [0; 6]
    1417             :                  * Right neighbour [7] is busy. It can't be coalesced with [6], so
    1418             :                  * mark [6] free, increase bb_counters and shrink range to
    1419             :                  * [0; 5].
    1420             :                  * Then shift range to [0; 2], go up and do the same.
    1421             :                  */
    1422             : 
    1423             : 
    1424         293 :                 if (first & 1)
    1425         110 :                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
    1426         293 :                 if (!(last & 1))
    1427         127 :                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
    1428         293 :                 if (first > last)
    1429             :                         break;
    1430         189 :                 order++;
    1431             : 
    1432         189 :                 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
    1433           0 :                         mb_clear_bits(buddy, first, last - first + 1);
    1434           0 :                         e4b->bd_info->bb_counters[order - 1] += last - first + 1;
    1435           0 :                         break;
    1436             :                 }
    1437         189 :                 first >>= 1;
    1438         189 :                 last >>= 1;
    1439         189 :                 buddy = buddy2;
    1440             :         }
    1441         104 : }
    1442             : 
    1443         166 : static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
    1444             :                            int first, int count)
    1445             : {
    1446         166 :         int left_is_free = 0;
    1447         166 :         int right_is_free = 0;
    1448         166 :         int block;
    1449         166 :         int last = first + count - 1;
    1450         166 :         struct super_block *sb = e4b->bd_sb;
    1451             : 
    1452         166 :         if (WARN_ON(count == 0))
    1453             :                 return;
    1454         166 :         BUG_ON(last >= (sb->s_blocksize << 3));
    1455         166 :         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
    1456             :         /* Don't bother if the block group is corrupt. */
    1457         166 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
    1458             :                 return;
    1459             : 
    1460         166 :         mb_check_buddy(e4b);
    1461         166 :         mb_free_blocks_double(inode, e4b, first, count);
    1462             : 
    1463         166 :         this_cpu_inc(discard_pa_seq);
    1464         166 :         e4b->bd_info->bb_free += count;
    1465         166 :         if (first < e4b->bd_info->bb_first_free)
    1466          10 :                 e4b->bd_info->bb_first_free = first;
    1467             : 
    1468             :         /* access memory sequentially: check left neighbour,
    1469             :          * clear range and then check right neighbour
    1470             :          */
    1471         166 :         if (first != 0)
    1472         166 :                 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
    1473         166 :         block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
    1474         166 :         if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
    1475         166 :                 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
    1476             : 
    1477         166 :         if (unlikely(block != -1)) {
    1478           0 :                 struct ext4_sb_info *sbi = EXT4_SB(sb);
    1479           0 :                 ext4_fsblk_t blocknr;
    1480             : 
    1481           0 :                 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
    1482           0 :                 blocknr += EXT4_C2B(sbi, block);
    1483           0 :                 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
    1484           0 :                         ext4_grp_locked_error(sb, e4b->bd_group,
    1485             :                                               inode ? inode->i_ino : 0,
    1486             :                                               blocknr,
    1487             :                                               "freeing already freed block (bit %u); block bitmap corrupt.",
    1488             :                                               block);
    1489           0 :                         ext4_mark_group_bitmap_corrupted(
    1490             :                                 sb, e4b->bd_group,
    1491             :                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    1492             :                 }
    1493           0 :                 goto done;
    1494             :         }
    1495             : 
    1496             :         /* let's maintain fragments counter */
    1497         166 :         if (left_is_free && right_is_free)
    1498          20 :                 e4b->bd_info->bb_fragments--;
    1499         146 :         else if (!left_is_free && !right_is_free)
    1500          79 :                 e4b->bd_info->bb_fragments++;
    1501             : 
    1502             :         /* buddy[0] == bd_bitmap is a special case, so handle
    1503             :          * it right away and let mb_buddy_mark_free stay free of
    1504             :          * zero order checks.
    1505             :          * Check if neighbours are to be coalesced,
    1506             :          * adjust bitmap bb_counters and borders appropriately.
    1507             :          */
    1508         166 :         if (first & 1) {
    1509          65 :                 first += !left_is_free;
    1510         104 :                 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
    1511             :         }
    1512         166 :         if (!(last & 1)) {
    1513          87 :                 last -= !right_is_free;
    1514         151 :                 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
    1515             :         }
    1516             : 
    1517         166 :         if (first <= last)
    1518         104 :                 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
    1519             : 
    1520          62 : done:
    1521         166 :         mb_set_largest_free_order(sb, e4b->bd_info);
    1522         166 :         mb_check_buddy(e4b);
    1523             : }
    1524             : 
    1525        3071 : static int mb_find_extent(struct ext4_buddy *e4b, int block,
    1526             :                                 int needed, struct ext4_free_extent *ex)
    1527             : {
    1528        3071 :         int next = block;
    1529        3071 :         int max, order;
    1530        3071 :         void *buddy;
    1531             : 
    1532        3071 :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
    1533        3071 :         BUG_ON(ex == NULL);
    1534             : 
    1535        3071 :         buddy = mb_find_buddy(e4b, 0, &max);
    1536        3071 :         BUG_ON(buddy == NULL);
    1537        3071 :         BUG_ON(block >= max);
    1538        3071 :         if (mb_test_bit(block, buddy)) {
    1539           8 :                 ex->fe_len = 0;
    1540           8 :                 ex->fe_start = 0;
    1541           8 :                 ex->fe_group = 0;
    1542           8 :                 return 0;
    1543             :         }
    1544             : 
    1545             :         /* find actual order */
    1546        3063 :         order = mb_find_order_for_block(e4b, block);
    1547        3063 :         block = block >> order;
    1548             : 
    1549        3063 :         ex->fe_len = 1 << order;
    1550        3063 :         ex->fe_start = block << order;
    1551        3063 :         ex->fe_group = e4b->bd_group;
    1552             : 
    1553             :         /* calc difference from given start */
    1554        3063 :         next = next - ex->fe_start;
    1555        3063 :         ex->fe_len -= next;
    1556        3063 :         ex->fe_start += next;
    1557             : 
    1558        5206 :         while (needed > ex->fe_len &&
    1559        1878 :                mb_find_buddy(e4b, order, &max)) {
    1560             : 
    1561        1878 :                 if (block + 1 >= max)
    1562             :                         break;
    1563             : 
    1564        1878 :                 next = (block + 1) * (1 << order);
    1565        1878 :                 if (mb_test_bit(next, e4b->bd_bitmap))
    1566             :                         break;
    1567             : 
    1568         265 :                 order = mb_find_order_for_block(e4b, next);
    1569             : 
    1570         265 :                 block = next >> order;
    1571         265 :                 ex->fe_len += 1 << order;
    1572             :         }
    1573             : 
    1574        3063 :         if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
    1575             :                 /* Should never happen! (but apparently sometimes does?!?) */
    1576           0 :                 WARN_ON(1);
    1577           0 :                 ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
    1578             :                            "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
    1579             :                            block, order, needed, ex->fe_group, ex->fe_start,
    1580             :                            ex->fe_len, ex->fe_logical);
    1581           0 :                 ex->fe_len = 0;
    1582           0 :                 ex->fe_start = 0;
    1583           0 :                 ex->fe_group = 0;
    1584             :         }
    1585        3063 :         return ex->fe_len;
    1586             : }
    1587             : /*
                     :  * mb_mark_used() - mark the extent @ex as allocated in the buddy data
                     :  * and in e4b->bd_bitmap of the group described by @e4b.
                     :  *
                     :  * The group spinlock must be held (checked by assert_spin_locked()).
                     :  * Besides the bitmaps, the group's bb_free, bb_first_free,
                     :  * bb_fragments and bb_counters[] are updated.
                     :  *
                     :  * Returns 0 when the extent was covered entirely by whole, aligned
                     :  * buddy chunks.  Otherwise returns the value recorded at the first
                     :  * split: the length still to be marked at that point in the low bits,
                     :  * OR-ed with the order of the buddy being split shifted left by 16.
                     :  */
    1588         197 : static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
    1589             : {
    1590         197 :         int ord;
    1591         197 :         int mlen = 0;
    1592         197 :         int max = 0;
    1593         197 :         int cur;
    1594         197 :         int start = ex->fe_start;
    1595         197 :         int len = ex->fe_len;
    1596         197 :         unsigned ret = 0;
    1597         197 :         int len0 = len;
    1598         197 :         void *buddy;
    1599             : 
    1600         197 :         BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
    1601         197 :         BUG_ON(e4b->bd_group != ex->fe_group);
    1602         197 :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
    1603         197 :         mb_check_buddy(e4b);
    1604         197 :         mb_mark_used_double(e4b, start, len);
    1605             : 
    1606         197 :         this_cpu_inc(discard_pa_seq);
    1607         197 :         e4b->bd_info->bb_free -= len;
    1608         197 :         if (e4b->bd_info->bb_first_free == start)
    1609           6 :                 e4b->bd_info->bb_first_free += len;
    1610             : 
    1611             :         /* let's maintain fragments counter:
                     :          * mlen/max become 1 when the bit just before/after the
                     :          * extent is clear (i.e. that neighbour is free).  Free on
                     :          * both sides means a free run is cut in two, so one more
                     :          * fragment; free on neither side means a whole free run
                     :          * is consumed, so one fragment less. */
    1612         197 :         if (start != 0)
    1613         197 :                 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
    1614         197 :         if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
    1615         197 :                 max = !mb_test_bit(start + len, e4b->bd_bitmap);
    1616         197 :         if (mlen && max)
    1617          16 :                 e4b->bd_info->bb_fragments++;
    1618         181 :         else if (!mlen && !max)
    1619          71 :                 e4b->bd_info->bb_fragments--;
    1620             : 
    1621             :         /* let's maintain buddy itself */
    1622         509 :         while (len) {
    1623         312 :                 ord = mb_find_order_for_block(e4b, start);
    1624             : 
    1625         312 :                 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
    1626             :                         /* the whole chunk may be allocated at once! */
    1627         219 :                         mlen = 1 << ord;
    1628         219 :                         buddy = mb_find_buddy(e4b, ord, &max);
    1629         219 :                         BUG_ON((start >> ord) >= max);
    1630         219 :                         mb_set_bit(start >> ord, buddy);
    1631         219 :                         e4b->bd_info->bb_counters[ord]--;
    1632         219 :                         start += mlen;
    1633         219 :                         len -= mlen;
    1634         219 :                         BUG_ON(len < 0);
    1635         219 :                         continue;
    1636             :                 }
    1637             : 
    1638             :                 /* store for history: only the first split is recorded,
                     :                  * as remaining length | (order << 16) */
    1639          93 :                 if (ret == 0)
    1640          66 :                         ret = len | (ord << 16);
    1641             : 
    1642             :                 /* we have to split large buddy: set the bit of the
                     :                  * order-ord chunk, then clear the bits of its two
                     :                  * halves one order below; the loop retries from the
                     :                  * same start at the lower order */
    1643          93 :                 BUG_ON(ord <= 0);
    1644          93 :                 buddy = mb_find_buddy(e4b, ord, &max);
    1645          93 :                 mb_set_bit(start >> ord, buddy);
    1646          93 :                 e4b->bd_info->bb_counters[ord]--;
    1647             : 
    1648          93 :                 ord--;
    1649          93 :                 cur = (start >> ord) & ~1U;
    1650          93 :                 buddy = mb_find_buddy(e4b, ord, &max);
    1651          93 :                 mb_clear_bit(cur, buddy);
    1652          93 :                 mb_clear_bit(cur + 1, buddy);
    1653          93 :                 e4b->bd_info->bb_counters[ord]++;
    1654          93 :                 e4b->bd_info->bb_counters[ord]++;
    1655             :         }
    1656         197 :         mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
    1657             : 
    1658         197 :         ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
    1659         197 :         mb_check_buddy(e4b);
    1660             : 
    1661         197 :         return ret;
    1662             : }
    1663             : 
    1664             : /*
                     :  * Commit the best extent found so far (ac->ac_b_ex) as the result of
                     :  * the allocation: its length is capped at the goal length, it is
                     :  * marked used via mb_mark_used(), copied to ac->ac_f_ex, and
                     :  * ac->ac_status becomes AC_STATUS_FOUND.  The value returned by
                     :  * mb_mark_used() is split into ac->ac_tail (low 16 bits) and
                     :  * ac->ac_buddy (the bits above).
                     :  *
    1665             :  * Must be called under group lock!
    1666             :  */
    1667         197 : static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
    1668             :                                         struct ext4_buddy *e4b)
    1669             : {
    1670         197 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    1671         197 :         int ret;
    1672             : 
    1673         197 :         BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
    1674         197 :         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
    1675             : 
    1676         197 :         ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
    1677         197 :         ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
    1678         197 :         ret = mb_mark_used(e4b, &ac->ac_b_ex);
    1679             : 
    1680             :         /* preallocation can change ac_b_ex, thus we store actually
    1681             :          * allocated blocks for history */
    1682         197 :         ac->ac_f_ex = ac->ac_b_ex;
    1683             : 
    1684         197 :         ac->ac_status = AC_STATUS_FOUND;
    1685         197 :         ac->ac_tail = ret & 0xffff;
    1686         197 :         ac->ac_buddy = ret >> 16;
    1687             : 
    1688             :         /*
    1689             :          * take the page reference. We want the page to be pinned
    1690             :          * so that we don't get an ext4_mb_init_cache call for this
    1691             :          * group until we update the bitmap. That would mean we
    1692             :          * double allocate blocks. The reference is dropped
    1693             :          * in ext4_mb_release_context
    1694             :          */
    1695         197 :         ac->ac_bitmap_page = e4b->bd_bitmap_page;
    1696         197 :         get_page(ac->ac_bitmap_page);
    1697         197 :         ac->ac_buddy_page = e4b->bd_buddy_page;
    1698         197 :         get_page(ac->ac_buddy_page);
    1699             :         /* store last allocated for subsequent stream allocation */
    1700         197 :         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
    1701           9 :                 spin_lock(&sbi->s_md_lock);
    1702           9 :                 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
    1703           9 :                 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
    1704           9 :                 spin_unlock(&sbi->s_md_lock);
    1705             :         }
    1706             :         /*
    1707             :          * If we've just taken more space than the
    1708             :          * user requested originally, we store the allocated
    1709             :          * space in a special descriptor.
    1710             :          */
    1711         197 :         if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
    1712           6 :                 ext4_mb_new_preallocation(ac);
    1713             : 
    1714         197 : }
    1715             : /*
                     :  * Decide whether scanning should stop.
                     :  *
                     :  * Nothing is done if an extent was already taken (AC_STATUS_FOUND).
                     :  * If more than s_mb_max_to_scan extents have been examined and
                     :  * EXT4_MB_HINT_FIRST is not set, the status becomes AC_STATUS_BREAK.
                     :  * Otherwise, once the best extent is at least as long as the goal
                     :  * and either @finish_group is set or more than s_mb_min_to_scan
                     :  * extents were examined, the best extent is re-checked in this
                     :  * group and, if still long enough, taken.
                     :  */
    1716        2900 : static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
    1717             :                                         struct ext4_buddy *e4b,
    1718             :                                         int finish_group)
    1719             : {
    1720        2900 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    1721        2900 :         struct ext4_free_extent *bex = &ac->ac_b_ex;
    1722        2900 :         struct ext4_free_extent *gex = &ac->ac_g_ex;
    1723        2900 :         struct ext4_free_extent ex;
    1724        2900 :         int max;
    1725             : 
    1726        2900 :         if (ac->ac_status == AC_STATUS_FOUND)
    1727        1856 :                 return;
    1728             :         /*
    1729             :          * We don't want to scan for a whole year
    1730             :          */
    1731        2718 :         if (ac->ac_found > sbi->s_mb_max_to_scan &&
    1732          16 :                         !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    1733          16 :                 ac->ac_status = AC_STATUS_BREAK;
    1734          16 :                 return;
    1735             :         }
    1736             : 
    1737             :         /*
    1738             :          * Haven't found a good chunk so far, let's continue
    1739             :          */
    1740        2702 :         if (bex->fe_len < gex->fe_len)
    1741             :                 return;
    1742             : 
    1743        1110 :         if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
    1744          66 :                         && bex->fe_group == e4b->bd_group) {
    1745             :                 /* recheck chunk's availability - we don't know
    1746             :                  * when it was found (within this lock-unlock
    1747             :                  * period or not) */
    1748          66 :                 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
    1749          66 :                 if (max >= gex->fe_len) {
    1750          66 :                         ext4_mb_use_best_found(ac, e4b);
    1751          66 :                         return;
    1752             :                 }
    1753             :         }
    1754             : }
    1755             : 
    1756             : /*
    1757             :  * The routine checks whether the found extent is good enough. If it is,
    1758             :  * then the extent gets marked used and the status in the context is set
    1759             :  * to stop scanning. Otherwise, the extent is compared with the
    1760             :  * previously found extent and if the new one is better, then it's stored
    1761             :  * in the context. Later, the best found extent will be used, if
    1762             :  * mballoc can't find a good enough extent.
                     :  *
                     :  * Every call increments ac->ac_found.  "Good enough" here means
                     :  * EXT4_MB_HINT_FIRST is set or the extent length equals the goal
                     :  * length exactly.
    1763             :  *
    1764             :  * FIXME: real allocation policy is to be designed yet!
    1765             :  */
    1766        2989 : static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
    1767             :                                         struct ext4_free_extent *ex,
    1768             :                                         struct ext4_buddy *e4b)
    1769             : {
    1770        2989 :         struct ext4_free_extent *bex = &ac->ac_b_ex;
    1771        2989 :         struct ext4_free_extent *gex = &ac->ac_g_ex;
    1772             : 
    1773        2989 :         BUG_ON(ex->fe_len <= 0);
    1774        2989 :         BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
    1775        2989 :         BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
    1776        2989 :         BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
    1777             : 
    1778        2989 :         ac->ac_found++;
    1779             : 
    1780             :         /*
    1781             :          * The special case - take what you catch first
    1782             :          */
    1783        2989 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    1784           0 :                 *bex = *ex;
    1785           0 :                 ext4_mb_use_best_found(ac, e4b);
    1786           0 :                 return;
    1787             :         }
    1788             : 
    1789             :         /*
    1790             :          * Let's check whether the chunk is good enough
    1791             :          */
    1792        2989 :         if (ex->fe_len == gex->fe_len) {
    1793         116 :                 *bex = *ex;
    1794         116 :                 ext4_mb_use_best_found(ac, e4b);
    1795         116 :                 return;
    1796             :         }
    1797             : 
    1798             :         /*
    1799             :          * If this is the first found extent, just store it in the context
    1800             :          */
    1801        2873 :         if (bex->fe_len == 0) {
    1802         163 :                 *bex = *ex;
    1803         163 :                 return;
    1804             :         }
    1805             : 
    1806             :         /*
    1807             :          * If the newly found extent is better, store it in the context
    1808             :          */
    1809        2710 :         if (bex->fe_len < gex->fe_len) {
    1810             :                 /* if the request isn't satisfied, any found extent
    1811             :                  * larger than the previous best one is better */
    1812        1603 :                 if (ex->fe_len > bex->fe_len)
    1813          16 :                         *bex = *ex;
    1814        1107 :         } else if (ex->fe_len > gex->fe_len) {
    1815             :                 /* if the request is satisfied, then we try to find
    1816             :                  * an extent that still satisfies the request, but is
    1817             :                  * smaller than the previous one */
    1818        1105 :                 if (ex->fe_len < bex->fe_len)
    1819         106 :                         *bex = *ex;
    1820             :         }
    1821             : 
    1822        2710 :         ext4_mb_check_limits(ac, e4b, 0);
    1823             : }
    1824             : /*
                     :  * Try to use the best extent remembered in ac->ac_b_ex.
                     :  *
                     :  * The buddy of the extent's group is loaded and, under the group
                     :  * lock, mb_find_extent() is run again at the remembered start and
                     :  * length.  If it reports anything (max > 0) the refreshed extent
                     :  * replaces ac->ac_b_ex and is taken with ext4_mb_use_best_found().
                     :  *
                     :  * Returns the error from ext4_mb_load_buddy(), or 0 otherwise
                     :  * (including when no extent could be taken).
                     :  */
    1825             : static noinline_for_stack
    1826           8 : int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
    1827             :                                         struct ext4_buddy *e4b)
    1828             : {
    1829           8 :         struct ext4_free_extent ex = ac->ac_b_ex;
    1830           8 :         ext4_group_t group = ex.fe_group;
    1831           8 :         int max;
    1832           8 :         int err;
    1833             : 
    1834           8 :         BUG_ON(ex.fe_len <= 0);
    1835           8 :         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
    1836           8 :         if (err)
    1837             :                 return err;
    1838             : 
    1839           8 :         ext4_lock_group(ac->ac_sb, group);
    1840           8 :         max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
    1841             : 
    1842           8 :         if (max > 0) {
    1843           8 :                 ac->ac_b_ex = ex;
    1844           8 :                 ext4_mb_use_best_found(ac, e4b);
    1845             :         }
    1846             : 
    1847           8 :         ext4_unlock_group(ac->ac_sb, group);
    1848           8 :         ext4_mb_unload_buddy(e4b);
    1849             : 
    1850           8 :         return 0;
    1851             : }
    1852             : /*
                     :  * Try to allocate exactly at the goal extent (ac->ac_g_ex).
                     :  *
                     :  * Does nothing and returns 0 unless EXT4_MB_HINT_TRY_GOAL is set and
                     :  * the goal group has free space; also returns 0 without allocating
                     :  * if the group's block bitmap is flagged corrupt.  Under the group
                     :  * lock the extent at the goal start is looked up and taken when:
                     :  *   - it covers the goal length, the goal length equals s_stripe and
                     :  *     the extent's absolute start is a multiple of s_stripe; or
                     :  *   - it covers the goal length (goal length differs from s_stripe); or
                     :  *   - it is shorter than the goal but non-empty and
                     :  *     EXT4_MB_HINT_MERGE is set.
                     :  *
                     :  * Returns the error from ext4_mb_load_buddy(), or 0 otherwise; the
                     :  * caller can tell whether an extent was taken from ac->ac_status.
                     :  *
                     :  * NOTE(review): grp is dereferenced without a NULL check - confirm
                     :  * ext4_get_group_info() cannot return NULL for the goal group.
                     :  */
    1853             : static noinline_for_stack
    1854         197 : int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
    1855             :                                 struct ext4_buddy *e4b)
    1856             : {
    1857         197 :         ext4_group_t group = ac->ac_g_ex.fe_group;
    1858         197 :         int max;
    1859         197 :         int err;
    1860         197 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    1861         197 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    1862         197 :         struct ext4_free_extent ex;
    1863             : 
    1864         197 :         if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
    1865             :                 return 0;
    1866           8 :         if (grp->bb_free == 0)
    1867             :                 return 0;
    1868             : 
    1869           8 :         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
    1870           8 :         if (err)
    1871             :                 return err;
    1872             : 
    1873           8 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
    1874           0 :                 ext4_mb_unload_buddy(e4b);
    1875           0 :                 return 0;
    1876             :         }
    1877             : 
    1878           8 :         ext4_lock_group(ac->ac_sb, group);
    1879           8 :         max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
    1880             :                              ac->ac_g_ex.fe_len, &ex);
    1881           8 :         ex.fe_logical = 0xDEADFA11; /* debug value */
    1882             : 
    1883           8 :         if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
    1884           0 :                 ext4_fsblk_t start;
    1885             : 
    1886           0 :                 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
    1887           0 :                         ex.fe_start;
    1888             :                 /* use do_div to get remainder (would be 64-bit modulo) */
    1889           0 :                 if (do_div(start, sbi->s_stripe) == 0) {
    1890           0 :                         ac->ac_found++;
    1891           0 :                         ac->ac_b_ex = ex;
    1892           0 :                         ext4_mb_use_best_found(ac, e4b);
    1893             :                 }
    1894           8 :         } else if (max >= ac->ac_g_ex.fe_len) {
    1895           0 :                 BUG_ON(ex.fe_len <= 0);
    1896           0 :                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
    1897           0 :                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
    1898           0 :                 ac->ac_found++;
    1899           0 :                 ac->ac_b_ex = ex;
    1900           0 :                 ext4_mb_use_best_found(ac, e4b);
    1901           8 :         } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
    1902             :                 /* Sometimes, caller may want to merge even small
    1903             :                  * number of blocks to an existing extent */
    1904           0 :                 BUG_ON(ex.fe_len <= 0);
    1905           0 :                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
    1906           0 :                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
    1907           0 :                 ac->ac_found++;
    1908           0 :                 ac->ac_b_ex = ex;
    1909           0 :                 ext4_mb_use_best_found(ac, e4b);
    1910             :         }
    1911           8 :         ext4_unlock_group(ac->ac_sb, group);
    1912           8 :         ext4_mb_unload_buddy(e4b);
    1913             : 
    1914           8 :         return 0;
    1915             : }
    1916             : 
    1917             : /*
    1918             :  * The routine scans buddy structures (not bitmap!) from the given order
    1919             :  * (ac->ac_2order, which must be positive) up to the maximum order and
                     :  * tries to find a big enough chunk to satisfy the request.
                     :  *
                     :  * For the first order whose bb_counters[] entry is non-zero, the
                     :  * first clear bit in that order's buddy gives the chunk, which is
                     :  * taken with ext4_mb_use_best_found().  If the counter is non-zero
                     :  * but no clear bit exists, the group bitmap is reported and marked
                     :  * corrupted and the scan stops.
    1920             :  */
    1921             : static noinline_for_stack
    1922           7 : void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
    1923             :                                         struct ext4_buddy *e4b)
    1924             : {
    1925           7 :         struct super_block *sb = ac->ac_sb;
    1926           7 :         struct ext4_group_info *grp = e4b->bd_info;
    1927           7 :         void *buddy;
    1928           7 :         int i;
    1929           7 :         int k;
    1930           7 :         int max;
    1931             : 
    1932           7 :         BUG_ON(ac->ac_2order <= 0);
    1933           7 :         for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
    1934           7 :                 if (grp->bb_counters[i] == 0)
    1935           0 :                         continue;
    1936             : 
    1937           7 :                 buddy = mb_find_buddy(e4b, i, &max);
    1938           7 :                 BUG_ON(buddy == NULL);
    1939             : 
    1940           7 :                 k = mb_find_next_zero_bit(buddy, max, 0);
    1941           7 :                 if (k >= max) {
    1942           0 :                         ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
    1943             :                                 "%d free clusters of order %d. But found 0",
    1944             :                                 grp->bb_counters[i], i);
    1945           0 :                         ext4_mark_group_bitmap_corrupted(ac->ac_sb,
    1946             :                                          e4b->bd_group,
    1947             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    1948           0 :                         break;
    1949             :                 }
    1950           7 :                 ac->ac_found++;
    1951             : 
    1952           7 :                 ac->ac_b_ex.fe_len = 1 << i;
    1953           7 :                 ac->ac_b_ex.fe_start = k << i;
    1954           7 :                 ac->ac_b_ex.fe_group = e4b->bd_group;
    1955             : 
    1956           7 :                 ext4_mb_use_best_found(ac, e4b);
    1957             : 
    1958           7 :                 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
    1959             : 
    1960           7 :                 if (EXT4_SB(sb)->s_mb_stats)
    1961           0 :                         atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
    1962             : 
    1963             :                 break;
    1964             :         }
    1965           7 : }
    1966             : 
    1967             : /*
    1968             :  * The routine scans the group and measures all found extents.
    1969             :  * In order to optimize scanning, the number of free clusters is read
    1970             :  * from the group info (bb_free), so the routine knows the upper limit.
                     :  *
                     :  * Scanning starts at bb_first_free and continues while free clusters
                     :  * remain unaccounted for and ac->ac_status is AC_STATUS_CONTINUE.
                     :  * Inconsistencies between bb_free and the bitmap are reported, the
                     :  * group bitmap is marked corrupted and the scan stops.  On exit
                     :  * ext4_mb_check_limits() is called with finish_group set.
    1971             :  */
    1972             : static noinline_for_stack
    1973         190 : void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
    1974             :                                         struct ext4_buddy *e4b)
    1975             : {
    1976         190 :         struct super_block *sb = ac->ac_sb;
    1977         190 :         void *bitmap = e4b->bd_bitmap;
    1978         190 :         struct ext4_free_extent ex;
    1979         190 :         int i;
    1980         190 :         int free;
    1981             : 
    1982         190 :         free = e4b->bd_info->bb_free;
    1983         190 :         if (WARN_ON(free <= 0))
    1984           0 :                 return;
    1985             : 
    1986         190 :         i = e4b->bd_info->bb_first_free;
    1987             : 
    1988        3179 :         while (free && ac->ac_status == AC_STATUS_CONTINUE) {
    1989        2989 :                 i = mb_find_next_zero_bit(bitmap,
    1990        2989 :                                                 EXT4_CLUSTERS_PER_GROUP(sb), i);
    1991        2989 :                 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
    1992             :                         /*
    1993             :                          * If we have a corrupt bitmap, we won't find any
    1994             :                          * free blocks even though group info says we
    1995             :                          * have free blocks
    1996             :                          */
    1997           0 :                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
    1998             :                                         "%d free clusters as per "
    1999             :                                         "group info. But bitmap says 0",
    2000             :                                         free);
    2001           0 :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
    2002             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    2003           0 :                         break;
    2004             :                 }
    2005             : 
    2006        2989 :                 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
    2007        2989 :                 if (WARN_ON(ex.fe_len <= 0))
    2008             :                         break;
    2009        2989 :                 if (free < ex.fe_len) {
    2010           0 :                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
    2011             :                                         "%d free clusters as per "
    2012             :                                         "group info. But got %d blocks",
    2013             :                                         free, ex.fe_len);
    2014           0 :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
    2015             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    2016             :                         /*
    2017             :                          * The number of free blocks differs. This mostly
    2018             :                          * indicates that the bitmap is corrupt. So exit
    2019             :                          * without claiming the space.
    2020             :                          */
    2021           0 :                         break;
    2022             :                 }
    2023        2989 :                 ex.fe_logical = 0xDEADC0DE; /* debug value */
    2024        2989 :                 ext4_mb_measure_extent(ac, &ex, e4b);
    2025             : 
    2026        2989 :                 i += ex.fe_len;
    2027        2989 :                 free -= ex.fe_len;
    2028             :         }
    2029             : 
    2030         190 :         ext4_mb_check_limits(ac, e4b, 1);
    2031             : }
    2032             : 
    2033             : /*
    2034             :  * This is a special case for storage like RAID5:
    2035             :  * we try to find stripe-aligned chunks for stripe-size-multiple requests.
                     :  *
                     :  * Starting from the first stripe-aligned position in the group, every
                     :  * s_stripe-th position whose bitmap bit is clear is probed with
                     :  * mb_find_extent(); the first one offering at least s_stripe is taken.
                     :  * s_stripe must be non-zero.
                     :  *
                     :  * NOTE(review): i is derived from filesystem block numbers but is
                     :  * bounded by EXT4_CLUSTERS_PER_GROUP() and used as a bitmap index -
                     :  * confirm the units agree when clusters are larger than blocks.
    2036             :  */
    2037             : static noinline_for_stack
    2038           0 : void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
    2039             :                                  struct ext4_buddy *e4b)
    2040             : {
    2041           0 :         struct super_block *sb = ac->ac_sb;
    2042           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2043           0 :         void *bitmap = e4b->bd_bitmap;
    2044           0 :         struct ext4_free_extent ex;
    2045           0 :         ext4_fsblk_t first_group_block;
    2046           0 :         ext4_fsblk_t a;
    2047           0 :         ext4_grpblk_t i;
    2048           0 :         int max;
    2049             : 
    2050           0 :         BUG_ON(sbi->s_stripe == 0);
    2051             : 
    2052             :         /* find first stripe-aligned block in group: round the group's
                     :          * first block number up to a multiple of s_stripe and
                     :          * convert back to a group-relative offset */
    2053           0 :         first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
    2054             : 
    2055           0 :         a = first_group_block + sbi->s_stripe - 1;
    2056           0 :         do_div(a, sbi->s_stripe);
    2057           0 :         i = (a * sbi->s_stripe) - first_group_block;
    2058             : 
    2059           0 :         while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
    2060           0 :                 if (!mb_test_bit(i, bitmap)) {
    2061           0 :                         max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
    2062           0 :                         if (max >= sbi->s_stripe) {
    2063           0 :                                 ac->ac_found++;
    2064           0 :                                 ex.fe_logical = 0xDEADF00D; /* debug value */
    2065           0 :                                 ac->ac_b_ex = ex;
    2066           0 :                                 ext4_mb_use_best_found(ac, e4b);
    2067           0 :                                 break;
    2068             :                         }
    2069             :                 }
    2070           0 :                 i += sbi->s_stripe;
    2071             :         }
    2072           0 : }
    2073             : 
    2074             : /*
    2075             :  * This is also called BEFORE we load the buddy bitmap.
    2076             :  * Returns true or false indicating that the group is either suitable
    2077             :  * for the allocation or not.
                     :  *
                     :  * A group is never suitable if its block bitmap is flagged corrupt,
                     :  * or if bb_free or bb_fragments is zero.  Beyond that the test
                     :  * depends on the criteria level @cr (must be 0..3):
                     :  *   0: not the first group of a flex group for data allocations
                     :  *      (when flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME), free space
                     :  *      of at least the goal length, and bb_largest_free_order of at
                     :  *      least ac_2order (this last check is skipped when ac_2order
                     :  *      exceeds s_blocksize_bits + 1);
                     :  *   1: the average free extent (free / fragments) is at least the
                     :  *      goal length;
                     :  *   2: free space is at least the goal length;
                     :  *   3: always suitable.
    2078             :  */
    2079         585 : static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
    2080             :                                 ext4_group_t group, int cr)
    2081             : {
    2082         585 :         ext4_grpblk_t free, fragments;
    2083         585 :         int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
    2084         585 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    2085             : 
    2086         585 :         BUG_ON(cr < 0 || cr >= 4);
    2087             : 
    2088         585 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
    2089             :                 return false;
    2090             : 
    2091         585 :         free = grp->bb_free;
    2092         585 :         if (free == 0)
    2093             :                 return false;
    2094             : 
    2095         585 :         fragments = grp->bb_fragments;
    2096         585 :         if (fragments == 0)
    2097             :                 return false;
    2098             : 
    2099         585 :         switch (cr) {
    2100          71 :         case 0:
    2101          71 :                 BUG_ON(ac->ac_2order == 0);
    2102             : 
    2103             :                 /* Avoid using the first bg of a flexgroup for data files */
    2104          71 :                 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
    2105          71 :                     (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
    2106          71 :                     ((group % flex_size) == 0))
    2107             :                         return false;
    2108             : 
    2109          70 :                 if (free < ac->ac_g_ex.fe_len)
    2110             :                         return false;
    2111             : 
    2112          70 :                 if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
    2113             :                         return true;
    2114             : 
    2115          70 :                 if (grp->bb_largest_free_order < ac->ac_2order)
    2116          56 :                         return false;
    2117             : 
    2118             :                 return true;
    2119         498 :         case 1:
    2120         498 :                 if ((free / fragments) >= ac->ac_g_ex.fe_len)
    2121         364 :                         return true;
    2122             :                 break;
    2123          16 :         case 2:
    2124          16 :                 if (free >= ac->ac_g_ex.fe_len)
    2125          16 :                         return true;
    2126             :                 break;
    2127             :         case 3:
    2128             :                 return true;
    2129           0 :         default:
    2130           0 :                 BUG();
    2131             :         }
    2132             : 
    2133             :         return false;
    2134             : }
    2135             : 
    2136             : /*
    2137             :  * This could return negative error code if something goes wrong
    2138             :  * during ext4_mb_init_group(). This should not be called with
    2139             :  * ext4_lock_group() held.
                     :  *
                     :  * Otherwise the result is 0 (group not suitable) or the value of
                     :  * ext4_mb_good_group().  A quick pre-check rejects groups with no
                     :  * free space, with less free space than the goal length (for
                     :  * cr <= 2), or with a corrupt block bitmap.  When
                     :  * EXT4_MB_STRICT_CHECK is set in ac_flags, the pre-check and the
                     :  * final ext4_mb_good_group() call run under the group lock; the
                     :  * lock is dropped around the group initialisation in between.
                     :  *
                     :  * NOTE(review): the inner "int ret" in the NEED_INIT branch shadows
                     :  * the function-level ret; behaviour is unaffected because the inner
                     :  * value is only used for the early return.
    2140             :  */
    2141         390 : static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
    2142             :                                      ext4_group_t group, int cr)
    2143             : {
    2144         390 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    2145         390 :         struct super_block *sb = ac->ac_sb;
    2146         390 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2147         390 :         bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
    2148         390 :         ext4_grpblk_t free;
    2149         390 :         int ret = 0;
    2150             : 
    2151         390 :         if (should_lock)
    2152           0 :                 ext4_lock_group(sb, group);
    2153         390 :         free = grp->bb_free;
    2154         390 :         if (free == 0)
    2155           0 :                 goto out;
    2156         390 :         if (cr <= 2 && free < ac->ac_g_ex.fe_len)
    2157           2 :                 goto out;
    2158         388 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
    2159           0 :                 goto out;
    2160         388 :         if (should_lock)
    2161           0 :                 ext4_unlock_group(sb, group);
    2162             : 
    2163             :         /* We only do this if the grp has never been initialized */
    2164         388 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    2165           0 :                 struct ext4_group_desc *gdp =
    2166           0 :                         ext4_get_group_desc(sb, group, NULL);
    2167           0 :                 int ret;
    2168             : 
    2169             :                 /* cr=0/1 is a very optimistic search to find large
    2170             :                  * good chunks almost for free.  If buddy data is not
    2171             :                  * ready, then this optimization makes no sense.  But
    2172             :                  * we never skip the first block group in a flex_bg,
    2173             :                  * since this gets used for metadata block allocation,
    2174             :                  * and we want to make sure we locate metadata blocks
    2175             :                  * in the first block group in the flex_bg if possible.
                     :                  * Groups whose descriptor carries EXT4_BG_BLOCK_UNINIT
                     :                  * (with group descriptor checksums enabled) are not
                     :                  * skipped either.
    2176             :                  */
    2177           0 :                 if (cr < 2 &&
    2178           0 :                     (!sbi->s_log_groups_per_flex ||
    2179           0 :                      ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
    2180           0 :                     !(ext4_has_group_desc_csum(sb) &&
    2181           0 :                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
    2182             :                         return 0;
    2183           0 :                 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
    2184           0 :                 if (ret)
    2185             :                         return ret;
    2186             :         }
    2187             : 
    2188         388 :         if (should_lock)
    2189           0 :                 ext4_lock_group(sb, group);
    2190         388 :         ret = ext4_mb_good_group(ac, group, cr);
    2191         390 : out:
    2192         390 :         if (should_lock)
    2193           0 :                 ext4_unlock_group(sb, group);
    2194             :         return ret;
    2195             : }
    2196             : 
    2197             : /*
    2198             :  * Start prefetching @nr block bitmaps starting at @group.
    2199             :  * Return the next group which needs to be prefetched.
                        *
                        * The group number wraps to 0 once it reaches the group count.  If
                        * @cnt is non-NULL it is incremented for every bitmap buffer that is
                        * not yet uptodate after its read was requested.  The whole batch is
                        * issued between blk_start_plug() and blk_finish_plug().
                        *
                        * NOTE(review): gdp is used without a NULL check here, while
                        * ext4_mb_init_backend() treats a NULL descriptor from
                        * ext4_get_group_desc() as possible -- confirm this is safe.
    2200             :  */
    2201         213 : ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
    2202             :                               unsigned int nr, int *cnt)
    2203             : {
    2204         213 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    2205         213 :         struct buffer_head *bh;
    2206         213 :         struct blk_plug plug;
    2207             : 
    2208         213 :         blk_start_plug(&plug);
    2209        3497 :         while (nr-- > 0) {
    2210        3284 :                 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
    2211             :                                                                   NULL);
    2212        3284 :                 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    2213             : 
    2214             :                 /*
    2215             :                  * Prefetch block groups with free blocks; but don't
    2216             :                  * bother if it is marked uninitialized on disk, since
    2217             :                  * it won't require I/O to read.  Also only try to
    2218             :                  * prefetch once, so we avoid getblk() call, which can
    2219             :                  * be expensive.
    2220             :                  */
    2221        3284 :                 if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
    2222          31 :                     EXT4_MB_GRP_NEED_INIT(grp) &&
    2223          30 :                     ext4_free_group_clusters(sb, gdp) > 0 &&
    2224          15 :                     !(ext4_has_group_desc_csum(sb) &&
    2225          15 :                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
    2226          15 :                         bh = ext4_read_block_bitmap_nowait(sb, group, true);
    2227          15 :                         if (bh && !IS_ERR(bh)) {
                                                       /* Count only reads that are still outstanding. */
    2228          15 :                                 if (!buffer_uptodate(bh) && cnt)
    2229          15 :                                         (*cnt)++;
    2230          15 :                                 brelse(bh);
    2231             :                         }
    2232             :                 }
    2233        3284 :                 if (++group >= ngroups)
    2234         213 :                         group = 0;
    2235             :         }
    2236         213 :         blk_finish_plug(&plug);
    2237         213 :         return group;
    2238             : }
    2239             : 
    2240             : /*
    2241             :  * Prefetching reads the block bitmap into the buffer cache; but we
    2242             :  * need to make sure that the buddy bitmap in the page cache has been
    2243             :  * initialized.  Note that ext4_mb_init_group() will block if the I/O
    2244             :  * is not yet completed, or indeed if ext4_mb_prefetch() did not
    2245             :  * start the I/O.
                        *
                        * Walks @nr groups backwards, starting with the group just before
                        * @group and wrapping from group 0 to the last group, and stops at
                        * the first ext4_mb_init_group() failure.
    2246             :  *
    2247             :  * TODO: We should actually kick off the buddy bitmap setup in a work
    2248             :  * queue when the buffer I/O is completed, so that we don't block
    2249             :  * waiting for the block allocation bitmap read to finish when
    2250             :  * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
    2251             :  */
    2252           1 : void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
    2253             :                            unsigned int nr)
    2254             : {
    2255          17 :         while (nr-- > 0) {
                                       /*
                                        * NOTE(review): gdp is looked up for @group before the
                                        * decrement below, whereas grp is re-fetched after it, so
                                        * the two describe different groups.  The grp initializer
                                        * here is also overwritten before use.  Looks unintended --
                                        * confirm whether gdp should be fetched after group--.
                                        */
    2256          16 :                 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
    2257             :                                                                   NULL);
    2258          16 :                 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    2259             : 
    2260          16 :                 if (!group)
    2261           1 :                         group = ext4_get_groups_count(sb);
    2262          16 :                 group--;
    2263          16 :                 grp = ext4_get_group_info(sb, group);
    2264             : 
    2265          31 :                 if (EXT4_MB_GRP_NEED_INIT(grp) &&
    2266          30 :                     ext4_free_group_clusters(sb, gdp) > 0 &&
    2267          15 :                     !(ext4_has_group_desc_csum(sb) &&
    2268          15 :                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
    2269          15 :                         if (ext4_mb_init_group(sb, group, GFP_NOFS))
    2270             :                                 break;
    2271             :                 }
    2272             :         }
    2273           1 : }
    2274             : 
                       /*
                        * Group-scanning allocator for @ac.
                        *
                        * The goal is tried first via ext4_mb_find_by_goal().  Unless that
                        * finds something, fails, or EXT4_MB_HINT_GOAL_ONLY is set, groups are
                        * scanned starting at the goal group with the criteria cr rising
                        * towards 3, prefetching block bitmaps in batches along the way.
                        * Each candidate group is pre-checked with
                        * ext4_mb_good_group_nolock(), then re-checked with
                        * ext4_mb_good_group() under the group lock before it is scanned.
                        *
                        * Returns 0, or the error from ext4_mb_find_by_goal() or
                        * ext4_mb_load_buddy(), or -- when nothing was found -- the first
                        * non-zero result recorded from ext4_mb_good_group_nolock().  The
                        * result of the search itself is left in ac->ac_status and
                        * ac->ac_b_ex.
                        */
    2275             : static noinline_for_stack int
    2276         197 : ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
    2277             : {
    2278         197 :         ext4_group_t prefetch_grp = 0, ngroups, group, i;
    2279         197 :         int cr = -1;
    2280         197 :         int err = 0, first_err = 0;
    2281         197 :         unsigned int nr = 0, prefetch_ios = 0;
    2282         197 :         struct ext4_sb_info *sbi;
    2283         197 :         struct super_block *sb;
    2284         197 :         struct ext4_buddy e4b;
    2285         197 :         int lost;
    2286             : 
    2287         197 :         sb = ac->ac_sb;
    2288         197 :         sbi = EXT4_SB(sb);
    2289         197 :         ngroups = ext4_get_groups_count(sb);
    2290             :         /* non-extent files are limited to low blocks/groups */
    2291         197 :         if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
    2292           0 :                 ngroups = sbi->s_blockfile_groups;
    2293             : 
    2294         197 :         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
    2295             : 
    2296             :         /* first, try the goal */
    2297         197 :         err = ext4_mb_find_by_goal(ac, &e4b);
    2298         197 :         if (err || ac->ac_status == AC_STATUS_FOUND)
    2299           0 :                 goto out;
    2300             : 
    2301         197 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    2302           0 :                 goto out;
    2303             : 
    2304             :         /*
    2305             :          * ac->ac_2order is set only if the fe_len is a power of 2
    2306             :          * if ac->ac_2order is set we also set criteria to 0 so that we
    2307             :          * try exact allocation using buddy.
    2308             :          */
    2309         197 :         i = fls(ac->ac_g_ex.fe_len); /* i is scratch here; it is the group loop counter below */
    2310         197 :         ac->ac_2order = 0;
    2311             :         /*
    2312             :          * We search using buddy data only if the order of the request
    2313             :          * is greater than or equal to sbi->s_mb_order2_reqs.
    2314             :          * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
    2315             :          * We also support searching for power-of-two requests only for
    2316             :          * requests up to the maximum buddy size we have constructed.
    2317             :          */
    2318         197 :         if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
    2319             :                 /*
    2320             :                  * This should tell if fe_len is exactly power of 2
    2321             :                  */
    2322          20 :                 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
    2323           7 :                         ac->ac_2order = array_index_nospec(i - 1,
    2324             :                                                            sb->s_blocksize_bits + 2);
    2325             :         }
    2326             : 
    2327             :         /* if stream allocation is enabled, use global goal */
    2328         197 :         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
    2329             :                 /* TBD: may be hot point */
    2330           9 :                 spin_lock(&sbi->s_md_lock);
    2331           9 :                 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
    2332           9 :                 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
    2333           9 :                 spin_unlock(&sbi->s_md_lock);
    2334             :         }
    2335             : 
    2336             :         /* Let's just scan groups to find more or less suitable blocks */
    2337         197 :         cr = ac->ac_2order ? 0 : 1;
    2338             :         /*
    2339             :          * cr == 0 try to get exact allocation,
    2340             :          * cr == 3  try to get anything
    2341             :          */
    2342             : repeat:
    2343         607 :         for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
    2344         205 :                 ac->ac_criteria = cr;
    2345             :                 /*
    2346             :                  * searching for the right group start
    2347             :                  * from the goal value specified
    2348             :                  */
    2349         205 :                 group = ac->ac_g_ex.fe_group;
    2350         205 :                 prefetch_grp = group;
    2351             : 
    2352         398 :                 for (i = 0; i < ngroups; group++, i++) {
    2353         390 :                         int ret = 0;
    2354         390 :                         cond_resched();
    2355             :                         /*
    2356             :                          * Artificially restricted ngroups for non-extent
    2357             :                          * files makes group > ngroups possible on first loop.
    2358             :                          */
    2359         390 :                         if (group >= ngroups)
    2360           8 :                                 group = 0;
    2361             : 
    2362             :                         /*
    2363             :                          * Batch reads of the block allocation bitmaps
    2364             :                          * to get multiple READs in flight; limit
    2365             :                          * prefetching at cr=0/1, otherwise mballoc can
    2366             :                          * spend a lot of time loading imperfect groups
    2367             :                          */
    2368         390 :                         if ((prefetch_grp == group) &&
    2369         205 :                             (cr > 1 ||
    2370         205 :                              prefetch_ios < sbi->s_mb_prefetch_limit)) {
    2371         213 :                                 unsigned int curr_ios = prefetch_ios;
    2372             : 
    2373         213 :                                 nr = sbi->s_mb_prefetch;
    2374         213 :                                 if (ext4_has_feature_flex_bg(sb)) {
    2375         213 :                                         nr = 1 << sbi->s_log_groups_per_flex;
    2376         213 :                                         nr -= group & (nr - 1);
    2377         213 :                                         nr = min(nr, sbi->s_mb_prefetch);
    2378             :                                 }
    2379         213 :                                 prefetch_grp = ext4_mb_prefetch(sb, group,
    2380             :                                                         nr, &prefetch_ios);
                                                       /* No new I/O counted: clear nr so the exit path skips ext4_mb_prefetch_fini(). */
    2381         213 :                                 if (prefetch_ios == curr_ios)
    2382         212 :                                         nr = 0;
    2383             :                         }
    2384             : 
    2385             :                         /* This now checks without needing the buddy page */
    2386         390 :                         ret = ext4_mb_good_group_nolock(ac, group, cr);
    2387         390 :                         if (ret <= 0) {
                                                       /* Keep only the first non-zero result; it is reported at out: if nothing is found. */
    2388         193 :                                 if (!first_err)
    2389         193 :                                         first_err = ret;
    2390         193 :                                 continue;
    2391             :                         }
    2392             : 
    2393         197 :                         err = ext4_mb_load_buddy(sb, group, &e4b);
    2394         197 :                         if (err)
    2395           0 :                                 goto out;
    2396             : 
    2397         197 :                         ext4_lock_group(sb, group);
    2398             : 
    2399             :                         /*
    2400             :                          * We need to check again after locking the
    2401             :                          * block group
    2402             :                          */
    2403         197 :                         ret = ext4_mb_good_group(ac, group, cr);
    2404         197 :                         if (ret == 0) {
    2405           0 :                                 ext4_unlock_group(sb, group);
    2406           0 :                                 ext4_mb_unload_buddy(&e4b);
    2407           0 :                                 continue;
    2408             :                         }
    2409             : 
    2410         197 :                         ac->ac_groups_scanned++;
    2411         197 :                         if (cr == 0)
    2412           7 :                                 ext4_mb_simple_scan_group(ac, &e4b);
    2413         190 :                         else if (cr == 1 && sbi->s_stripe &&
    2414           0 :                                         !(ac->ac_g_ex.fe_len % sbi->s_stripe))
    2415           0 :                                 ext4_mb_scan_aligned(ac, &e4b);
    2416             :                         else
    2417         190 :                                 ext4_mb_complex_scan_group(ac, &e4b);
    2418             : 
    2419         197 :                         ext4_unlock_group(sb, group);
    2420         197 :                         ext4_mb_unload_buddy(&e4b);
    2421             : 
    2422         197 :                         if (ac->ac_status != AC_STATUS_CONTINUE)
    2423             :                                 break;
    2424             :                 }
    2425             :         }
    2426             : 
    2427         197 :         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
    2428           8 :             !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    2429             :                 /*
    2430             :                  * We've been searching too long. Let's try to allocate
    2431             :                  * the best chunk we've found so far
    2432             :                  */
    2433           8 :                 ext4_mb_try_best_found(ac, &e4b);
    2434           8 :                 if (ac->ac_status != AC_STATUS_FOUND) {
    2435             :                         /*
    2436             :                          * Someone more lucky has already allocated it.
    2437             :                          * The only thing we can do is just take first
    2438             :                          * found block(s)
    2439             :                          */
    2440           0 :                         lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
    2441           0 :                         mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
    2442             :                                  ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
    2443             :                                  ac->ac_b_ex.fe_len, lost);
    2444             : 
                                               /* Reset the best extent and rescan at cr 3 with EXT4_MB_HINT_FIRST set. */
    2445           0 :                         ac->ac_b_ex.fe_group = 0;
    2446           0 :                         ac->ac_b_ex.fe_start = 0;
    2447           0 :                         ac->ac_b_ex.fe_len = 0;
    2448           0 :                         ac->ac_status = AC_STATUS_CONTINUE;
    2449           0 :                         ac->ac_flags |= EXT4_MB_HINT_FIRST;
    2450           0 :                         cr = 3;
    2451           0 :                         goto repeat;
    2452             :                 }
    2453             :         }
    2454         197 : out:
    2455         197 :         if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
    2456           0 :                 err = first_err;
    2457             : 
    2458         197 :         mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
    2459             :                  ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
    2460             :                  ac->ac_flags, cr, err);
    2461             : 
                               /* nr is non-zero only if the last prefetch batch counted new I/O. */
    2462         197 :         if (nr)
    2463           1 :                 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
    2464             : 
    2465         197 :         return err;
    2466             : }
    2467             : 
                       /*
                        * seq_file ->start hook for the groups listing.
                        *
                        * Returns NULL when *pos is outside [0, group count).  Otherwise
                        * returns the group number plus one, cast to a pointer; the +1 keeps
                        * group 0 from being encoded as NULL, and ext4_mb_seq_groups_show()
                        * subtracts it again.
                        */
    2468           0 : static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
    2469             : {
    2470           0 :         struct super_block *sb = PDE_DATA(file_inode(seq->file));
    2471           0 :         ext4_group_t group;
    2472             : 
    2473           0 :         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
    2474           0 :                 return NULL;
    2475           0 :         group = *pos + 1;
    2476           0 :         return (void *) ((unsigned long) group);
    2477             : }
    2478             : 
                       /*
                        * seq_file ->next hook for the groups listing.
                        *
                        * Advances *pos by one and then encodes the new position exactly as
                        * ext4_mb_seq_groups_start() does: NULL when out of range, otherwise
                        * group number plus one cast to a pointer.  @v is not used.
                        */
    2479           0 : static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
    2480             : {
    2481           0 :         struct super_block *sb = PDE_DATA(file_inode(seq->file));
    2482           0 :         ext4_group_t group;
    2483             : 
    2484           0 :         ++*pos;
    2485           0 :         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
    2486           0 :                 return NULL;
    2487           0 :         group = *pos + 1;
    2488           0 :         return (void *) ((unsigned long) group);
    2489             : }
    2490             : 
                       /*
                        * seq_file ->show hook: print one line for the group encoded in @v
                        * (group number plus one).
                        *
                        * The line lists bb_free, bb_fragments, bb_first_free and 14
                        * bb_counters columns (2^0 .. 2^13); columns above index
                        * blocksize_bits + 1 are printed as 0.  The column header is emitted
                        * before group 0.  If the group still needs initialization its buddy
                        * is loaded first; a load failure is reported in the output as
                        * "I/O error".  Always returns 0.
                        */
    2491           0 : static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
    2492             : {
    2493           0 :         struct super_block *sb = PDE_DATA(file_inode(seq->file));
    2494           0 :         ext4_group_t group = (ext4_group_t) ((unsigned long) v);
    2495           0 :         int i;
    2496           0 :         int err, buddy_loaded = 0;
    2497           0 :         struct ext4_buddy e4b;
    2498           0 :         struct ext4_group_info *grinfo;
    2499           0 :         unsigned char blocksize_bits = min_t(unsigned char,
    2500             :                                              sb->s_blocksize_bits,
    2501             :                                              EXT4_MAX_BLOCK_LOG_SIZE);
                               /* Stack snapshot: the group info followed by room for its counters. */
    2502           0 :         struct sg {
    2503             :                 struct ext4_group_info info;
    2504             :                 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
    2505             :         } sg;
    2506             : 
    2507           0 :         group--; /* undo the +1 applied by the start/next hooks */
    2508           0 :         if (group == 0)
    2509           0 :                 seq_puts(seq, "#group: free  frags first ["
    2510             :                               " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
    2511             :                               " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
    2512             : 
                               /* Bytes to copy: the info struct plus blocksize_bits + 2 counters. */
    2513           0 :         i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
    2514             :                 sizeof(struct ext4_group_info);
    2515             : 
    2516           0 :         grinfo = ext4_get_group_info(sb, group);
    2517             :         /* Load the group info in memory only if not already loaded. */
    2518           0 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
    2519           0 :                 err = ext4_mb_load_buddy(sb, group, &e4b);
    2520           0 :                 if (err) {
    2521           0 :                         seq_printf(seq, "#%-5u: I/O error\n", group);
    2522           0 :                         return 0;
    2523             :                 }
    2524             :                 buddy_loaded = 1;
    2525             :         }
    2526             : 
    2527           0 :         memcpy(&sg, ext4_get_group_info(sb, group), i);
    2528             : 
    2529           0 :         if (buddy_loaded)
    2530           0 :                 ext4_mb_unload_buddy(&e4b);
    2531             : 
    2532           0 :         seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
    2533             :                         sg.info.bb_fragments, sg.info.bb_first_free);
    2534           0 :         for (i = 0; i <= 13; i++)
    2535           0 :                 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
    2536             :                                 sg.info.bb_counters[i] : 0);
    2537           0 :         seq_puts(seq, " ]\n");
    2538             : 
    2539           0 :         return 0;
    2540             : }
    2541             : 
                       /* seq_file ->stop hook; intentionally empty, there is nothing to release. */
    2542           0 : static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
    2543             : {
    2544           0 : }
    2545             : 
                       /* seq_file operations table wiring up the four groups-listing hooks above. */
    2546             : const struct seq_operations ext4_mb_seq_groups_ops = {
    2547             :         .start  = ext4_mb_seq_groups_start,
    2548             :         .next   = ext4_mb_seq_groups_next,
    2549             :         .stop   = ext4_mb_seq_groups_stop,
    2550             :         .show   = ext4_mb_seq_groups_show,
    2551             : };
    2552             : 
                       /*
                        * Return the ext4_groupinfo_caches[] entry for @blocksize_bits, indexed
                        * by blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE.  BUGs if that entry is
                        * NULL.
                        *
                        * NOTE(review): the index itself is not range-checked here; presumably
                        * callers only pass a validated block size -- confirm.
                        */
    2553          16 : static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
    2554             : {
    2555          16 :         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
    2556          16 :         struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
    2557             : 
    2558          16 :         BUG_ON(!cachep);
    2559          16 :         return cachep;
    2560             : }
    2561             : 
    2562             : /*
    2563             :  * Allocate the top-level s_group_info array for the specified number
    2564             :  * of groups
                        *
                        * The array needs one slot per EXT4_DESC_PER_BLOCK(sb) groups.  If
                        * the current array already has enough slots nothing is done.
                        * Otherwise a zeroed array is allocated with its byte size rounded up
                        * to a power of two, the old contents are copied over, the new array
                        * is published with rcu_assign_pointer() and the old one is handed to
                        * ext4_kvfree_array_rcu().
                        *
                        * Returns 0 on success (including the nothing-to-do case) or -ENOMEM
                        * if the allocation fails.
    2565             :  */
    2566           1 : int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
    2567             : {
    2568           1 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2569           1 :         unsigned size;
    2570           1 :         struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
    2571             : 
                               /* Slots needed: ngroups / groups-per-descriptor-block, rounded up. */
    2572           1 :         size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
    2573           1 :                 EXT4_DESC_PER_BLOCK_BITS(sb);
    2574           1 :         if (size <= sbi->s_group_info_size)
    2575             :                 return 0;
    2576             : 
                               /* From here on size is a byte count, not a slot count. */
    2577           1 :         size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
    2578           1 :         new_groupinfo = kvzalloc(size, GFP_KERNEL);
    2579           1 :         if (!new_groupinfo) {
    2580           0 :                 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
    2581           0 :                 return -ENOMEM;
    2582             :         }
    2583           1 :         rcu_read_lock();
    2584           1 :         old_groupinfo = rcu_dereference(sbi->s_group_info);
    2585           1 :         if (old_groupinfo)
    2586           0 :                 memcpy(new_groupinfo, old_groupinfo,
    2587           0 :                        sbi->s_group_info_size * sizeof(*sbi->s_group_info));
    2588           1 :         rcu_read_unlock();
    2589           1 :         rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
    2590           1 :         sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
    2591           1 :         if (old_groupinfo)
    2592           0 :                 ext4_kvfree_array_rcu(old_groupinfo);
    2593             :         ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
    2594             :                    sbi->s_group_info_size);
    2595             :         return 0;
    2596             : }
    2597             : 
    2598             : /* Create and initialize ext4_group_info data for the given group. */
                       /*
                        * When @group is the first group covered by a descriptor block, a new
                        * table of ext4_group_info pointers is allocated and stored in
                        * s_group_info[idx] first.  The group's ext4_group_info is then
                        * allocated zeroed from the per-blocksize cache, flagged as needing
                        * init, and its bb_free is seeded from @desc.
                        *
                        * Returns 0 on success or -ENOMEM if either allocation fails; if the
                        * group info allocation fails, a pointer table allocated by this call
                        * is freed again and its slot reset to NULL.
                        */
    2599          16 : int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
    2600             :                           struct ext4_group_desc *desc)
    2601             : {
    2602          16 :         int i;
    2603          16 :         int metalen = 0;
    2604          16 :         int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
    2605          16 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2606          16 :         struct ext4_group_info **meta_group_info;
    2607          16 :         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    2608             : 
    2609             :         /*
    2610             :          * First check if this group is the first of a reserved block.
    2611             :          * If it's true, we have to allocate a new table of pointers
    2612             :          * to ext4_group_info structures
    2613             :          */
    2614          16 :         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
    2615           1 :                 metalen = sizeof(*meta_group_info) <<
    2616           1 :                         EXT4_DESC_PER_BLOCK_BITS(sb);
    2617           1 :                 meta_group_info = kmalloc(metalen, GFP_NOFS);
    2618           1 :                 if (meta_group_info == NULL) {
    2619           0 :                         ext4_msg(sb, KERN_ERR, "can't allocate mem "
    2620             :                                  "for a buddy group");
    2621           0 :                         goto exit_meta_group_info;
    2622             :                 }
    2623           1 :                 rcu_read_lock();
    2624           1 :                 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
    2625           1 :                 rcu_read_unlock();
    2626             :         }
    2627             : 
    2628          32 :         meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
    2629          16 :         i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); /* slot of this group within its table */
    2630             : 
    2631          16 :         meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
    2632          16 :         if (meta_group_info[i] == NULL) {
    2633           0 :                 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
    2634           0 :                 goto exit_group_info;
    2635             :         }
    2636          16 :         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
    2637          16 :                 &(meta_group_info[i]->bb_state));
    2638             : 
    2639             :         /*
    2640             :          * initialize bb_free to be able to skip
    2641             :          * empty groups without initialization
    2642             :          */
    2643          16 :         if (ext4_has_group_desc_csum(sb) &&
    2644          16 :             (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    2645           0 :                 meta_group_info[i]->bb_free =
    2646           0 :                         ext4_free_clusters_after_init(sb, group, desc);
    2647             :         } else {
    2648          32 :                 meta_group_info[i]->bb_free =
    2649          16 :                         ext4_free_group_clusters(sb, desc);
    2650             :         }
    2651             : 
    2652          16 :         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
    2653          16 :         init_rwsem(&meta_group_info[i]->alloc_sem);
    2654          16 :         meta_group_info[i]->bb_free_root = RB_ROOT;
    2655          16 :         meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
    2656             : 
    2657          16 :         mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
    2658          16 :         return 0;
    2659             : 
    2660           0 : exit_group_info:
    2661             :         /* If a meta_group_info table has been allocated, release it now */
    2662           0 :         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
    2663           0 :                 struct ext4_group_info ***group_info;
    2664             : 
    2665           0 :                 rcu_read_lock();
    2666           0 :                 group_info = rcu_dereference(sbi->s_group_info);
    2667           0 :                 kfree(group_info[idx]);
    2668           0 :                 group_info[idx] = NULL;
    2669           0 :                 rcu_read_unlock();
    2670             :         }
    2671           0 : exit_meta_group_info:
    2672             :         return -ENOMEM;
    2673             : } /* ext4_mb_add_groupinfo */
    2674             : 
    2675           1 : static int ext4_mb_init_backend(struct super_block *sb)
    2676             : {
    2677           1 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    2678           1 :         ext4_group_t i;
    2679           1 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2680           1 :         int err;
    2681           1 :         struct ext4_group_desc *desc;
    2682           1 :         struct ext4_group_info ***group_info;
    2683           1 :         struct kmem_cache *cachep;
    2684             : 
    2685           1 :         err = ext4_mb_alloc_groupinfo(sb, ngroups);
    2686           1 :         if (err)
    2687             :                 return err;
    2688             : 
    2689           1 :         sbi->s_buddy_cache = new_inode(sb);
    2690           1 :         if (sbi->s_buddy_cache == NULL) {
    2691           0 :                 ext4_msg(sb, KERN_ERR, "can't get new inode");
    2692           0 :                 goto err_freesgi;
    2693             :         }
    2694             :         /* To avoid potentially colliding with a valid on-disk inode number,
    2695             :          * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
    2696             :          * not in the inode hash, so it should never be found by iget(), but
    2697             :          * this will avoid confusion if it ever shows up during debugging. */
    2698           1 :         sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
    2699           1 :         EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
    2700          17 :         for (i = 0; i < ngroups; i++) {
    2701          16 :                 cond_resched();
    2702          16 :                 desc = ext4_get_group_desc(sb, i, NULL);
    2703          16 :                 if (desc == NULL) {
    2704           0 :                         ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
    2705           0 :                         goto err_freebuddy;
    2706             :                 }
    2707          16 :                 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
    2708           0 :                         goto err_freebuddy;
    2709             :         }
    2710             : 
    2711           1 :         if (ext4_has_feature_flex_bg(sb)) {
    2712             :                 /* a single flex group is supposed to be read by a single IO */
    2713           1 :                 sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
    2714             :                         BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
    2715           1 :                 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
    2716             :         } else {
    2717           0 :                 sbi->s_mb_prefetch = 32;
    2718             :         }
    2719           1 :         if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
    2720           1 :                 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
    2721             :         /* how many real IOs to prefetch within a single allocation at cr=0
    2722             :          * given cr=0 is a CPU-related optimization we shouldn't try to
    2723             :          * load too many groups, at some point we should start to use what
    2724             :          * we've got in memory.
    2725             :          * with an average random access time 5ms, it'd take a second to get
    2726             :          * 200 groups (* N with flex_bg), so let's make this limit 4
    2727             :          */
    2728           1 :         sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
    2729           1 :         if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
    2730           1 :                 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
    2731             : 
    2732             :         return 0;
    2733             : 
    2734           0 : err_freebuddy:
    2735           0 :         cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    2736           0 :         while (i-- > 0)
    2737           0 :                 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
    2738           0 :         i = sbi->s_group_info_size;
    2739           0 :         rcu_read_lock();
    2740           0 :         group_info = rcu_dereference(sbi->s_group_info);
    2741           0 :         while (i-- > 0)
    2742           0 :                 kfree(group_info[i]);
    2743           0 :         rcu_read_unlock();
    2744           0 :         iput(sbi->s_buddy_cache);
    2745           0 : err_freesgi:
    2746           0 :         rcu_read_lock();
    2747           0 :         kvfree(rcu_dereference(sbi->s_group_info));
    2748           0 :         rcu_read_unlock();
    2749           0 :         return -ENOMEM;
    2750             : }
    2751             : 
    2752           0 : static void ext4_groupinfo_destroy_slabs(void)
    2753             : {
    2754           0 :         int i;
    2755             : 
    2756           0 :         for (i = 0; i < NR_GRPINFO_CACHES; i++) {
    2757           0 :                 kmem_cache_destroy(ext4_groupinfo_caches[i]);
    2758           0 :                 ext4_groupinfo_caches[i] = NULL;
    2759             :         }
    2760           0 : }
    2761             : 
    2762           1 : static int ext4_groupinfo_create_slab(size_t size)
    2763             : {
    2764           1 :         static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
    2765           1 :         int slab_size;
    2766           1 :         int blocksize_bits = order_base_2(size);
    2767           1 :         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
    2768           1 :         struct kmem_cache *cachep;
    2769             : 
    2770           1 :         if (cache_index >= NR_GRPINFO_CACHES)
    2771             :                 return -EINVAL;
    2772             : 
    2773           1 :         if (unlikely(cache_index < 0))
    2774           0 :                 cache_index = 0;
    2775             : 
    2776           1 :         mutex_lock(&ext4_grpinfo_slab_create_mutex);
    2777           1 :         if (ext4_groupinfo_caches[cache_index]) {
    2778           0 :                 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
    2779           0 :                 return 0;       /* Already created */
    2780             :         }
    2781             : 
    2782           1 :         slab_size = offsetof(struct ext4_group_info,
    2783             :                                 bb_counters[blocksize_bits + 2]);
    2784             : 
    2785           1 :         cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
    2786             :                                         slab_size, 0, SLAB_RECLAIM_ACCOUNT,
    2787             :                                         NULL);
    2788             : 
    2789           1 :         ext4_groupinfo_caches[cache_index] = cachep;
    2790             : 
    2791           1 :         mutex_unlock(&ext4_grpinfo_slab_create_mutex);
    2792           1 :         if (!cachep) {
    2793           0 :                 printk(KERN_EMERG
    2794             :                        "EXT4-fs: no memory for groupinfo slab cache\n");
    2795           0 :                 return -ENOMEM;
    2796             :         }
    2797             : 
    2798             :         return 0;
    2799             : }
    2800             : 
    2801           1 : int ext4_mb_init(struct super_block *sb)
    2802             : {
    2803           1 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2804           1 :         unsigned i, j;
    2805           1 :         unsigned offset, offset_incr;
    2806           1 :         unsigned max;
    2807           1 :         int ret;
    2808             : 
    2809           1 :         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
    2810             : 
    2811           1 :         sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
    2812           1 :         if (sbi->s_mb_offsets == NULL) {
    2813           0 :                 ret = -ENOMEM;
    2814           0 :                 goto out;
    2815             :         }
    2816             : 
    2817           1 :         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
    2818           1 :         sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
    2819           1 :         if (sbi->s_mb_maxs == NULL) {
    2820           0 :                 ret = -ENOMEM;
    2821           0 :                 goto out;
    2822             :         }
    2823             : 
    2824           1 :         ret = ext4_groupinfo_create_slab(sb->s_blocksize);
    2825           1 :         if (ret < 0)
    2826           0 :                 goto out;
    2827             : 
    2828             :         /* order 0 is regular bitmap */
    2829           1 :         sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
    2830           1 :         sbi->s_mb_offsets[0] = 0;
    2831             : 
    2832           1 :         i = 1;
    2833           1 :         offset = 0;
    2834           1 :         offset_incr = 1 << (sb->s_blocksize_bits - 1);
    2835           1 :         max = sb->s_blocksize << 2;
    2836          13 :         do {
    2837          13 :                 sbi->s_mb_offsets[i] = offset;
    2838          13 :                 sbi->s_mb_maxs[i] = max;
    2839          13 :                 offset += offset_incr;
    2840          13 :                 offset_incr = offset_incr >> 1;
    2841          13 :                 max = max >> 1;
    2842          13 :                 i++;
    2843          13 :         } while (i <= sb->s_blocksize_bits + 1);
    2844             : 
    2845           1 :         spin_lock_init(&sbi->s_md_lock);
    2846           1 :         spin_lock_init(&sbi->s_bal_lock);
    2847           1 :         sbi->s_mb_free_pending = 0;
    2848           1 :         INIT_LIST_HEAD(&sbi->s_freed_data_list);
    2849             : 
    2850           1 :         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
    2851           1 :         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
    2852           1 :         sbi->s_mb_stats = MB_DEFAULT_STATS;
    2853           1 :         sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
    2854           1 :         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
    2855           1 :         sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
    2856             :         /*
    2857             :          * The default group preallocation is 512, which for 4k block
    2858             :          * sizes translates to 2 megabytes.  However for bigalloc file
    2859             :          * systems, this is probably too big (i.e, if the cluster size
    2860             :          * is 1 megabyte, then group preallocation size becomes half a
    2861             :          * gigabyte!).  As a default, we will keep a two megabyte
    2862             :          * group prealloc size for cluster sizes up to 64k, and after
    2863             :          * that, we will force a minimum group preallocation size of
    2864             :          * 32 clusters.  This translates to 8 megs when the cluster
    2865             :          * size is 256k, and 32 megs when the cluster size is 1 meg,
    2866             :          * which seems reasonable as a default.
    2867             :          */
    2868           1 :         sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
    2869             :                                        sbi->s_cluster_bits, 32);
    2870             :         /*
    2871             :          * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
    2872             :          * to the lowest multiple of s_stripe which is bigger than
    2873             :          * the s_mb_group_prealloc as determined above. We want
    2874             :          * the preallocation size to be an exact multiple of the
    2875             :          * RAID stripe size so that preallocations don't fragment
    2876             :          * the stripes.
    2877             :          */
    2878           1 :         if (sbi->s_stripe > 1) {
    2879           0 :                 sbi->s_mb_group_prealloc = roundup(
    2880             :                         sbi->s_mb_group_prealloc, sbi->s_stripe);
    2881             :         }
    2882             : 
    2883           1 :         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
    2884           1 :         if (sbi->s_locality_groups == NULL) {
    2885           0 :                 ret = -ENOMEM;
    2886           0 :                 goto out;
    2887             :         }
    2888           5 :         for_each_possible_cpu(i) {
    2889           4 :                 struct ext4_locality_group *lg;
    2890           4 :                 lg = per_cpu_ptr(sbi->s_locality_groups, i);
    2891           4 :                 mutex_init(&lg->lg_mutex);
    2892          48 :                 for (j = 0; j < PREALLOC_TB_SIZE; j++)
    2893          40 :                         INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
    2894           5 :                 spin_lock_init(&lg->lg_prealloc_lock);
    2895             :         }
    2896             : 
    2897             :         /* init file for buddy data */
    2898           1 :         ret = ext4_mb_init_backend(sb);
    2899           1 :         if (ret != 0)
    2900           0 :                 goto out_free_locality_groups;
    2901             : 
    2902             :         return 0;
    2903             : 
    2904           0 : out_free_locality_groups:
    2905           0 :         free_percpu(sbi->s_locality_groups);
    2906           0 :         sbi->s_locality_groups = NULL;
    2907           0 : out:
    2908           0 :         kfree(sbi->s_mb_offsets);
    2909           0 :         sbi->s_mb_offsets = NULL;
    2910           0 :         kfree(sbi->s_mb_maxs);
    2911           0 :         sbi->s_mb_maxs = NULL;
    2912           0 :         return ret;
    2913             : }
    2914             : 
    2915             : /* needs to be called with the ext4 group lock held */
    2916           0 : static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
    2917             : {
    2918           0 :         struct ext4_prealloc_space *pa;
    2919           0 :         struct list_head *cur, *tmp;
    2920           0 :         int count = 0;
    2921             : 
    2922           0 :         list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
    2923           0 :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
    2924           0 :                 list_del(&pa->pa_group_list);
    2925           0 :                 count++;
    2926           0 :                 kmem_cache_free(ext4_pspace_cachep, pa);
    2927             :         }
    2928           0 :         return count;
    2929             : }
    2930             : 
    2931           0 : int ext4_mb_release(struct super_block *sb)
    2932             : {
    2933           0 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    2934           0 :         ext4_group_t i;
    2935           0 :         int num_meta_group_infos;
    2936           0 :         struct ext4_group_info *grinfo, ***group_info;
    2937           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2938           0 :         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    2939           0 :         int count;
    2940             : 
    2941           0 :         if (sbi->s_group_info) {
    2942           0 :                 for (i = 0; i < ngroups; i++) {
    2943           0 :                         cond_resched();
    2944           0 :                         grinfo = ext4_get_group_info(sb, i);
    2945           0 :                         mb_group_bb_bitmap_free(grinfo);
    2946           0 :                         ext4_lock_group(sb, i);
    2947           0 :                         count = ext4_mb_cleanup_pa(grinfo);
    2948           0 :                         if (count)
    2949             :                                 mb_debug(sb, "mballoc: %d PAs left\n",
    2950             :                                          count);
    2951           0 :                         ext4_unlock_group(sb, i);
    2952           0 :                         kmem_cache_free(cachep, grinfo);
    2953             :                 }
    2954           0 :                 num_meta_group_infos = (ngroups +
    2955           0 :                                 EXT4_DESC_PER_BLOCK(sb) - 1) >>
    2956           0 :                         EXT4_DESC_PER_BLOCK_BITS(sb);
    2957           0 :                 rcu_read_lock();
    2958           0 :                 group_info = rcu_dereference(sbi->s_group_info);
    2959           0 :                 for (i = 0; i < num_meta_group_infos; i++)
    2960           0 :                         kfree(group_info[i]);
    2961           0 :                 kvfree(group_info);
    2962           0 :                 rcu_read_unlock();
    2963             :         }
    2964           0 :         kfree(sbi->s_mb_offsets);
    2965           0 :         kfree(sbi->s_mb_maxs);
    2966           0 :         iput(sbi->s_buddy_cache);
    2967           0 :         if (sbi->s_mb_stats) {
    2968           0 :                 ext4_msg(sb, KERN_INFO,
    2969             :                        "mballoc: %u blocks %u reqs (%u success)",
    2970             :                                 atomic_read(&sbi->s_bal_allocated),
    2971             :                                 atomic_read(&sbi->s_bal_reqs),
    2972             :                                 atomic_read(&sbi->s_bal_success));
    2973           0 :                 ext4_msg(sb, KERN_INFO,
    2974             :                       "mballoc: %u extents scanned, %u goal hits, "
    2975             :                                 "%u 2^N hits, %u breaks, %u lost",
    2976             :                                 atomic_read(&sbi->s_bal_ex_scanned),
    2977             :                                 atomic_read(&sbi->s_bal_goals),
    2978             :                                 atomic_read(&sbi->s_bal_2orders),
    2979             :                                 atomic_read(&sbi->s_bal_breaks),
    2980             :                                 atomic_read(&sbi->s_mb_lost_chunks));
    2981           0 :                 ext4_msg(sb, KERN_INFO,
    2982             :                        "mballoc: %lu generated and it took %Lu",
    2983             :                                 sbi->s_mb_buddies_generated,
    2984             :                                 sbi->s_mb_generation_time);
    2985           0 :                 ext4_msg(sb, KERN_INFO,
    2986             :                        "mballoc: %u preallocated, %u discarded",
    2987             :                                 atomic_read(&sbi->s_mb_preallocated),
    2988             :                                 atomic_read(&sbi->s_mb_discarded));
    2989             :         }
    2990             : 
    2991           0 :         free_percpu(sbi->s_locality_groups);
    2992             : 
    2993           0 :         return 0;
    2994             : }
    2995             : 
    2996           0 : static inline int ext4_issue_discard(struct super_block *sb,
    2997             :                 ext4_group_t block_group, ext4_grpblk_t cluster, int count,
    2998             :                 struct bio **biop)
    2999             : {
    3000           0 :         ext4_fsblk_t discard_block;
    3001             : 
    3002           0 :         discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
    3003           0 :                          ext4_group_first_block_no(sb, block_group));
    3004           0 :         count = EXT4_C2B(EXT4_SB(sb), count);
    3005           0 :         trace_ext4_discard_blocks(sb,
    3006             :                         (unsigned long long) discard_block, count);
    3007           0 :         if (biop) {
    3008           0 :                 return __blkdev_issue_discard(sb->s_bdev,
    3009             :                         (sector_t)discard_block << (sb->s_blocksize_bits - 9),
    3010           0 :                         (sector_t)count << (sb->s_blocksize_bits - 9),
    3011             :                         GFP_NOFS, 0, biop);
    3012             :         } else
    3013           0 :                 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
    3014             : }
    3015             : 
    3016         164 : static void ext4_free_data_in_buddy(struct super_block *sb,
    3017             :                                     struct ext4_free_data *entry)
    3018             : {
    3019         164 :         struct ext4_buddy e4b;
    3020         164 :         struct ext4_group_info *db;
    3021         164 :         int err, count = 0, count2 = 0;
    3022             : 
    3023         164 :         mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
    3024             :                  entry->efd_count, entry->efd_group, entry);
    3025             : 
    3026         164 :         err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
    3027             :         /* we expect to find existing buddy because it's pinned */
    3028         164 :         BUG_ON(err != 0);
    3029             : 
    3030         164 :         spin_lock(&EXT4_SB(sb)->s_md_lock);
    3031         164 :         EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
    3032         164 :         spin_unlock(&EXT4_SB(sb)->s_md_lock);
    3033             : 
    3034         164 :         db = e4b.bd_info;
    3035             :         /* there are blocks to put in buddy to make them really free */
    3036         164 :         count += entry->efd_count;
    3037         164 :         count2++;
    3038         164 :         ext4_lock_group(sb, entry->efd_group);
    3039             :         /* Take it out of per group rb tree */
    3040         164 :         rb_erase(&entry->efd_node, &(db->bb_free_root));
    3041         164 :         mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
    3042             : 
    3043             :         /*
    3044             :          * Clear the trimmed flag for the group so that the next
    3045             :          * ext4_trim_fs can trim it.
    3046             :          * If the volume is mounted with -o discard, online discard
    3047             :          * is supported and the free blocks will be trimmed online.
    3048             :          */
    3049         164 :         if (!test_opt(sb, DISCARD))
    3050         164 :                 EXT4_MB_GRP_CLEAR_TRIMMED(db);
    3051             : 
    3052         164 :         if (!db->bb_free_root.rb_node) {
    3053             :                 /* No more items in the per group rb tree
    3054             :                  * balance refcounts from ext4_mb_free_metadata()
    3055             :                  */
    3056          79 :                 put_page(e4b.bd_buddy_page);
    3057          79 :                 put_page(e4b.bd_bitmap_page);
    3058             :         }
    3059         164 :         ext4_unlock_group(sb, entry->efd_group);
    3060         164 :         kmem_cache_free(ext4_free_data_cachep, entry);
    3061         164 :         ext4_mb_unload_buddy(&e4b);
    3062             : 
    3063         164 :         mb_debug(sb, "freed %d blocks in %d structures\n", count,
    3064             :                  count2);
    3065         164 : }
    3066             : 
    3067             : /*
    3068             :  * This function is called by the jbd2 layer once the commit has finished,
    3069             :  * so we know we can free the blocks that were released with that commit.
    3070             :  */
    3071          58 : void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
    3072             : {
    3073          58 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3074          58 :         struct ext4_free_data *entry, *tmp;
    3075          58 :         struct bio *discard_bio = NULL;
    3076          58 :         struct list_head freed_data_list;
    3077          58 :         struct list_head *cut_pos = NULL;
    3078          58 :         int err;
    3079             : 
    3080          58 :         INIT_LIST_HEAD(&freed_data_list);
    3081             : 
    3082          58 :         spin_lock(&sbi->s_md_lock);
    3083         222 :         list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
    3084         164 :                 if (entry->efd_tid != commit_tid)
    3085             :                         break;
    3086         164 :                 cut_pos = &entry->efd_list;
    3087             :         }
    3088          58 :         if (cut_pos)
    3089          42 :                 list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
    3090             :                                   cut_pos);
    3091          58 :         spin_unlock(&sbi->s_md_lock);
    3092             : 
    3093          58 :         if (test_opt(sb, DISCARD)) {
    3094           0 :                 list_for_each_entry(entry, &freed_data_list, efd_list) {
    3095           0 :                         err = ext4_issue_discard(sb, entry->efd_group,
    3096             :                                                  entry->efd_start_cluster,
    3097             :                                                  entry->efd_count,
    3098             :                                                  &discard_bio);
    3099           0 :                         if (err && err != -EOPNOTSUPP) {
    3100           0 :                                 ext4_msg(sb, KERN_WARNING, "discard request in"
    3101             :                                          " group:%d block:%d count:%d failed"
    3102             :                                          " with %d", entry->efd_group,
    3103             :                                          entry->efd_start_cluster,
    3104             :                                          entry->efd_count, err);
    3105           0 :                         } else if (err == -EOPNOTSUPP)
    3106             :                                 break;
    3107             :                 }
    3108             : 
    3109           0 :                 if (discard_bio) {
    3110           0 :                         submit_bio_wait(discard_bio);
    3111           0 :                         bio_put(discard_bio);
    3112             :                 }
    3113             :         }
    3114             : 
    3115         222 :         list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
    3116         164 :                 ext4_free_data_in_buddy(sb, entry);
    3117          58 : }
    3118             : 
    3119           1 : int __init ext4_init_mballoc(void)
    3120             : {
    3121           1 :         ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
    3122             :                                         SLAB_RECLAIM_ACCOUNT);
    3123           1 :         if (ext4_pspace_cachep == NULL)
    3124           0 :                 goto out;
    3125             : 
    3126           1 :         ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
    3127             :                                     SLAB_RECLAIM_ACCOUNT);
    3128           1 :         if (ext4_ac_cachep == NULL)
    3129           0 :                 goto out_pa_free;
    3130             : 
    3131           1 :         ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
    3132             :                                            SLAB_RECLAIM_ACCOUNT);
    3133           1 :         if (ext4_free_data_cachep == NULL)
    3134           0 :                 goto out_ac_free;
    3135             : 
    3136             :         return 0;
    3137             : 
    3138           0 : out_ac_free:
    3139           0 :         kmem_cache_destroy(ext4_ac_cachep);
    3140           0 : out_pa_free:
    3141           0 :         kmem_cache_destroy(ext4_pspace_cachep);
    3142             : out:
    3143             :         return -ENOMEM;
    3144             : }
    3145             : 
    3146           0 : void ext4_exit_mballoc(void)
    3147             : {
    3148             :         /*
    3149             :          * Wait for completion of call_rcu()'s on ext4_pspace_cachep
    3150             :          * before destroying the slab cache.
    3151             :          */
    3152           0 :         rcu_barrier();
    3153           0 :         kmem_cache_destroy(ext4_pspace_cachep);
    3154           0 :         kmem_cache_destroy(ext4_ac_cachep);
    3155           0 :         kmem_cache_destroy(ext4_free_data_cachep);
    3156           0 :         ext4_groupinfo_destroy_slabs();
    3157           0 : }
    3158             : 
    3159             : 
    3160             : /*
    3161             :  * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
    3162             :  * Returns 0 if success or error code
    3163             :  */
    3164             : static noinline_for_stack int
    3165         245 : ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
    3166             :                                 handle_t *handle, unsigned int reserv_clstrs)
    3167             : {
    3168         245 :         struct buffer_head *bitmap_bh = NULL;
    3169         245 :         struct ext4_group_desc *gdp;
    3170         245 :         struct buffer_head *gdp_bh;
    3171         245 :         struct ext4_sb_info *sbi;
    3172         245 :         struct super_block *sb;
    3173         245 :         ext4_fsblk_t block;
    3174         245 :         int err, len;
    3175             : 
    3176         245 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    3177         245 :         BUG_ON(ac->ac_b_ex.fe_len <= 0);
    3178             : 
    3179         245 :         sb = ac->ac_sb;
    3180         245 :         sbi = EXT4_SB(sb);
    3181             : 
    3182         245 :         bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
    3183         245 :         if (IS_ERR(bitmap_bh)) {
    3184           0 :                 err = PTR_ERR(bitmap_bh);
    3185           0 :                 bitmap_bh = NULL;
    3186           0 :                 goto out_err;
    3187             :         }
    3188             : 
    3189         245 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    3190         245 :         err = ext4_journal_get_write_access(handle, bitmap_bh);
    3191         245 :         if (err)
    3192           0 :                 goto out_err;
    3193             : 
    3194         245 :         err = -EIO;
    3195         245 :         gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
    3196         245 :         if (!gdp)
    3197           0 :                 goto out_err;
    3198             : 
    3199         245 :         ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
    3200             :                         ext4_free_group_clusters(sb, gdp));
    3201             : 
    3202         245 :         BUFFER_TRACE(gdp_bh, "get_write_access");
    3203         245 :         err = ext4_journal_get_write_access(handle, gdp_bh);
    3204         245 :         if (err)
    3205           0 :                 goto out_err;
    3206             : 
    3207         245 :         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    3208             : 
    3209         245 :         len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    3210         245 :         if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
    3211           0 :                 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
    3212             :                            "fs metadata", block, block+len);
    3213             :                 /* File system mounted not to panic on error
    3214             :                  * Fix the bitmap and return EFSCORRUPTED
    3215             :                  * We leak some of the blocks here.
    3216             :                  */
    3217           0 :                 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
    3218           0 :                 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
    3219             :                               ac->ac_b_ex.fe_len);
    3220           0 :                 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
    3221           0 :                 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    3222           0 :                 if (!err)
    3223           0 :                         err = -EFSCORRUPTED;
    3224           0 :                 goto out_err;
    3225             :         }
    3226             : 
    3227         245 :         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
    3228             : #ifdef AGGRESSIVE_CHECK
    3229             :         {
    3230             :                 int i;
    3231             :                 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
    3232             :                         BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
    3233             :                                                 bitmap_bh->b_data));
    3234             :                 }
    3235             :         }
    3236             : #endif
    3237         245 :         ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
    3238             :                       ac->ac_b_ex.fe_len);
    3239         245 :         if (ext4_has_group_desc_csum(sb) &&
    3240         245 :             (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    3241           0 :                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
    3242           0 :                 ext4_free_group_clusters_set(sb, gdp,
    3243             :                                              ext4_free_clusters_after_init(sb,
    3244             :                                                 ac->ac_b_ex.fe_group, gdp));
    3245             :         }
    3246         245 :         len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
    3247         245 :         ext4_free_group_clusters_set(sb, gdp, len);
    3248         245 :         ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
    3249         245 :         ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
    3250             : 
    3251         245 :         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
    3252         245 :         percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
    3253             :         /*
    3254             :          * Now reduce the dirty block count also. Should not go negative
    3255             :          */
    3256         245 :         if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
    3257             :                 /* release all the reserved blocks if non delalloc */
    3258         173 :                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
    3259             :                                    reserv_clstrs);
    3260             : 
    3261         245 :         if (sbi->s_log_groups_per_flex) {
    3262         245 :                 ext4_group_t flex_group = ext4_flex_group(sbi,
    3263             :                                                           ac->ac_b_ex.fe_group);
    3264         735 :                 atomic64_sub(ac->ac_b_ex.fe_len,
    3265         490 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    3266             :                                                   flex_group)->free_clusters);
    3267             :         }
    3268             : 
    3269         245 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    3270         245 :         if (err)
    3271           0 :                 goto out_err;
    3272         245 :         err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
    3273             : 
    3274         245 : out_err:
    3275         245 :         brelse(bitmap_bh);
    3276         245 :         return err;
    3277             : }
    3278             : 
    3279             : /*
    3280             :  * Idempotent helper for Ext4 fast commit replay path to set the state of
    3281             :  * blocks in bitmaps and update counters.
    3282             :  */
    3283           0 : void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
    3284             :                         int len, int state)
    3285             : {
    3286           0 :         struct buffer_head *bitmap_bh = NULL;
    3287           0 :         struct ext4_group_desc *gdp;
    3288           0 :         struct buffer_head *gdp_bh;
    3289           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3290           0 :         ext4_group_t group;
    3291           0 :         ext4_grpblk_t blkoff;
    3292           0 :         int i, clen, err;
    3293           0 :         int already;
    3294             : 
    3295           0 :         clen = EXT4_B2C(sbi, len);
    3296             : 
    3297           0 :         ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
    3298           0 :         bitmap_bh = ext4_read_block_bitmap(sb, group);
    3299           0 :         if (IS_ERR(bitmap_bh)) {
    3300           0 :                 err = PTR_ERR(bitmap_bh);
    3301           0 :                 bitmap_bh = NULL;
    3302           0 :                 goto out_err;
    3303             :         }
    3304             : 
    3305           0 :         err = -EIO;
    3306           0 :         gdp = ext4_get_group_desc(sb, group, &gdp_bh);
    3307           0 :         if (!gdp)
    3308           0 :                 goto out_err;
    3309             : 
    3310           0 :         ext4_lock_group(sb, group);
    3311           0 :         already = 0;
    3312           0 :         for (i = 0; i < clen; i++)
    3313           0 :                 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
    3314           0 :                         already++;
    3315             : 
    3316           0 :         if (state)
    3317           0 :                 ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
    3318             :         else
    3319           0 :                 mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
    3320           0 :         if (ext4_has_group_desc_csum(sb) &&
    3321           0 :             (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    3322           0 :                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
    3323           0 :                 ext4_free_group_clusters_set(sb, gdp,
    3324             :                                              ext4_free_clusters_after_init(sb,
    3325             :                                                 group, gdp));
    3326             :         }
    3327           0 :         if (state)
    3328           0 :                 clen = ext4_free_group_clusters(sb, gdp) - clen + already;
    3329             :         else
    3330           0 :                 clen = ext4_free_group_clusters(sb, gdp) + clen - already;
    3331             : 
    3332           0 :         ext4_free_group_clusters_set(sb, gdp, clen);
    3333           0 :         ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
    3334           0 :         ext4_group_desc_csum_set(sb, group, gdp);
    3335             : 
    3336           0 :         ext4_unlock_group(sb, group);
    3337             : 
    3338           0 :         if (sbi->s_log_groups_per_flex) {
    3339           0 :                 ext4_group_t flex_group = ext4_flex_group(sbi, group);
    3340             : 
    3341           0 :                 atomic64_sub(len,
    3342           0 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    3343             :                                                   flex_group)->free_clusters);
    3344             :         }
    3345             : 
    3346           0 :         err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
    3347           0 :         if (err)
    3348           0 :                 goto out_err;
    3349           0 :         sync_dirty_buffer(bitmap_bh);
    3350           0 :         err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
    3351           0 :         sync_dirty_buffer(gdp_bh);
    3352             : 
    3353           0 : out_err:
    3354           0 :         brelse(bitmap_bh);
    3355           0 : }
    3356             : 
    3357             : /*
    3358             :  * here we normalize request for locality group
    3359             :  * Group request are normalized to s_mb_group_prealloc, which goes to
    3360             :  * s_strip if we set the same via mount option.
    3361             :  * s_mb_group_prealloc can be configured via
    3362             :  * /sys/fs/ext4/<partition>/mb_group_prealloc
    3363             :  *
    3364             :  * XXX: should we try to preallocate more than the group has now?
    3365             :  */
    3366           4 : static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
    3367             : {
    3368           4 :         struct super_block *sb = ac->ac_sb;
    3369           4 :         struct ext4_locality_group *lg = ac->ac_lg;
    3370             : 
    3371           4 :         BUG_ON(lg == NULL);
    3372           4 :         ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
    3373           4 :         mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
    3374           4 : }
    3375             : 
    3376             : /*
    3377             :  * Normalization means making request better in terms of
    3378             :  * size and alignment
    3379             :  */
    3380             : static noinline_for_stack void
    3381         197 : ext4_mb_normalize_request(struct ext4_allocation_context *ac,
    3382             :                                 struct ext4_allocation_request *ar)
    3383             : {
    3384         197 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    3385         197 :         int bsbits, max;
    3386         197 :         ext4_lblk_t end;
    3387         197 :         loff_t size, start_off;
    3388         197 :         loff_t orig_size __maybe_unused;
    3389         197 :         ext4_lblk_t start;
    3390         197 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    3391         197 :         struct ext4_prealloc_space *pa;
    3392             : 
    3393             :         /* do normalize only data requests, metadata requests
    3394             :            do not need preallocation */
    3395         197 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    3396             :                 return;
    3397             : 
    3398             :         /* sometime caller may want exact blocks */
    3399          22 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    3400             :                 return;
    3401             : 
    3402             :         /* caller may indicate that preallocation isn't
    3403             :          * required (it's a tail, for example) */
    3404          22 :         if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
    3405             :                 return;
    3406             : 
    3407          13 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
    3408           4 :                 ext4_mb_normalize_group_request(ac);
    3409           4 :                 return ;
    3410             :         }
    3411             : 
    3412           9 :         bsbits = ac->ac_sb->s_blocksize_bits;
    3413             : 
    3414             :         /* first, let's learn actual file size
    3415             :          * given current request is allocated */
    3416           9 :         size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
    3417           9 :         size = size << bsbits;
    3418           9 :         if (size < i_size_read(ac->ac_inode))
    3419             :                 size = i_size_read(ac->ac_inode);
    3420           9 :         orig_size = size;
    3421             : 
    3422             :         /* max size of free chunks */
    3423           9 :         max = 2 << bsbits;
    3424             : 
    3425             : #define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
    3426             :                 (req <= (size) || max <= (chunk_size))
    3427             : 
    3428             :         /* first, try to predict filesize */
    3429             :         /* XXX: should this table be tunable? */
    3430           9 :         start_off = 0;
    3431           9 :         if (size <= 16 * 1024) {
    3432             :                 size = 16 * 1024;
    3433           9 :         } else if (size <= 32 * 1024) {
    3434             :                 size = 32 * 1024;
    3435           9 :         } else if (size <= 64 * 1024) {
    3436             :                 size = 64 * 1024;
    3437           9 :         } else if (size <= 128 * 1024) {
    3438             :                 size = 128 * 1024;
    3439           9 :         } else if (size <= 256 * 1024) {
    3440             :                 size = 256 * 1024;
    3441           8 :         } else if (size <= 512 * 1024) {
    3442             :                 size = 512 * 1024;
    3443           0 :         } else if (size <= 1024 * 1024) {
    3444             :                 size = 1024 * 1024;
    3445           0 :         } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
    3446           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    3447           0 :                                                 (21 - bsbits)) << 21;
    3448           0 :                 size = 2 * 1024 * 1024;
    3449           0 :         } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
    3450           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    3451           0 :                                                         (22 - bsbits)) << 22;
    3452           0 :                 size = 4 * 1024 * 1024;
    3453           0 :         } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
    3454             :                                         (8<<20)>>bsbits, max, 8 * 1024)) {
    3455           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    3456           0 :                                                         (23 - bsbits)) << 23;
    3457           0 :                 size = 8 * 1024 * 1024;
    3458             :         } else {
    3459           0 :                 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
    3460           0 :                 size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
    3461             :                                               ac->ac_o_ex.fe_len) << bsbits;
    3462             :         }
    3463           9 :         size = size >> bsbits;
    3464           9 :         start = start_off >> bsbits;
    3465             : 
    3466             :         /* don't cover already allocated blocks in selected range */
    3467           9 :         if (ar->pleft && start <= ar->lleft) {
    3468           8 :                 size -= ar->lleft + 1 - start;
    3469           8 :                 start = ar->lleft + 1;
    3470             :         }
    3471           9 :         if (ar->pright && start + size - 1 >= ar->lright)
    3472           0 :                 size -= start + size - ar->lright;
    3473             : 
    3474             :         /*
    3475             :          * Trim allocation request for filesystems with artificially small
    3476             :          * groups.
    3477             :          */
    3478           9 :         if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
    3479           0 :                 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
    3480             : 
    3481           9 :         end = start + size;
    3482             : 
    3483             :         /* check we don't cross already preallocated blocks */
    3484           9 :         rcu_read_lock();
    3485           9 :         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
    3486           0 :                 ext4_lblk_t pa_end;
    3487             : 
    3488           0 :                 if (pa->pa_deleted)
    3489           0 :                         continue;
    3490           0 :                 spin_lock(&pa->pa_lock);
    3491           0 :                 if (pa->pa_deleted) {
    3492           0 :                         spin_unlock(&pa->pa_lock);
    3493           0 :                         continue;
    3494             :                 }
    3495             : 
    3496           0 :                 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
    3497             :                                                   pa->pa_len);
    3498             : 
    3499             :                 /* PA must not overlap original request */
    3500           0 :                 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
    3501             :                         ac->ac_o_ex.fe_logical < pa->pa_lstart));
    3502             : 
    3503             :                 /* skip PAs this normalized request doesn't overlap with */
    3504           0 :                 if (pa->pa_lstart >= end || pa_end <= start) {
    3505           0 :                         spin_unlock(&pa->pa_lock);
    3506           0 :                         continue;
    3507             :                 }
    3508           0 :                 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
    3509             : 
    3510             :                 /* adjust start or end to be adjacent to this pa */
    3511           0 :                 if (pa_end <= ac->ac_o_ex.fe_logical) {
    3512             :                         BUG_ON(pa_end < start);
    3513             :                         start = pa_end;
    3514           0 :                 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
    3515           0 :                         BUG_ON(pa->pa_lstart > end);
    3516           0 :                         end = pa->pa_lstart;
    3517             :                 }
    3518           0 :                 spin_unlock(&pa->pa_lock);
    3519             :         }
    3520           9 :         rcu_read_unlock();
    3521           9 :         size = end - start;
    3522             : 
    3523             :         /* XXX: extra loop to check we really don't overlap preallocations */
    3524           9 :         rcu_read_lock();
    3525           9 :         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
    3526           0 :                 ext4_lblk_t pa_end;
    3527             : 
    3528           0 :                 spin_lock(&pa->pa_lock);
    3529           0 :                 if (pa->pa_deleted == 0) {
    3530           0 :                         pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
    3531             :                                                           pa->pa_len);
    3532           0 :                         BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
    3533             :                 }
    3534           0 :                 spin_unlock(&pa->pa_lock);
    3535             :         }
    3536           9 :         rcu_read_unlock();
    3537             : 
    3538           9 :         if (start + size <= ac->ac_o_ex.fe_logical &&
    3539             :                         start > ac->ac_o_ex.fe_logical) {
    3540           0 :                 ext4_msg(ac->ac_sb, KERN_ERR,
    3541             :                          "start %lu, size %lu, fe_logical %lu",
    3542             :                          (unsigned long) start, (unsigned long) size,
    3543             :                          (unsigned long) ac->ac_o_ex.fe_logical);
    3544           0 :                 BUG();
    3545             :         }
    3546           9 :         BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
    3547             : 
    3548             :         /* now prepare goal request */
    3549             : 
    3550             :         /* XXX: is it better to align blocks WRT to logical
    3551             :          * placement or satisfy big request as is */
    3552           9 :         ac->ac_g_ex.fe_logical = start;
    3553           9 :         ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
    3554             : 
    3555             :         /* define goal start in order to merge */
    3556           9 :         if (ar->pright && (ar->lright == (start + size))) {
    3557             :                 /* merge to the right */
    3558           0 :                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
    3559             :                                                 &ac->ac_f_ex.fe_group,
    3560             :                                                 &ac->ac_f_ex.fe_start);
    3561           0 :                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
    3562             :         }
    3563           9 :         if (ar->pleft && (ar->lleft + 1 == start)) {
    3564             :                 /* merge to the left */
    3565           8 :                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
    3566             :                                                 &ac->ac_f_ex.fe_group,
    3567             :                                                 &ac->ac_f_ex.fe_start);
    3568           8 :                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
    3569             :         }
    3570             : 
    3571             :         mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
    3572             :                  orig_size, start);
    3573             : }
    3574             : 
    3575         245 : static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
    3576             : {
    3577         245 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    3578             : 
    3579         245 :         if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
    3580           0 :                 atomic_inc(&sbi->s_bal_reqs);
    3581           0 :                 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
    3582           0 :                 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
    3583           0 :                         atomic_inc(&sbi->s_bal_success);
    3584           0 :                 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
    3585           0 :                 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
    3586           0 :                                 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
    3587           0 :                         atomic_inc(&sbi->s_bal_goals);
    3588           0 :                 if (ac->ac_found > sbi->s_mb_max_to_scan)
    3589           0 :                         atomic_inc(&sbi->s_bal_breaks);
    3590             :         }
    3591             : 
    3592         245 :         if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
    3593         197 :                 trace_ext4_mballoc_alloc(ac);
    3594             :         else
    3595          48 :                 trace_ext4_mballoc_prealloc(ac);
    3596         245 : }
    3597             : 
    3598             : /*
    3599             :  * Called on failure; free up any blocks from the inode PA for this
    3600             :  * context.  We don't need this for MB_GROUP_PA because we only change
    3601             :  * pa_free in ext4_mb_release_context(), but on failure, we've already
    3602             :  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
    3603             :  */
    3604           0 : static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
    3605             : {
    3606           0 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    3607           0 :         struct ext4_buddy e4b;
    3608           0 :         int err;
    3609             : 
    3610           0 :         if (pa == NULL) {
    3611           0 :                 if (ac->ac_f_ex.fe_len == 0)
    3612           0 :                         return;
    3613           0 :                 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
    3614           0 :                 if (err) {
    3615             :                         /*
    3616             :                          * This should never happen since we pin the
    3617             :                          * pages in the ext4_allocation_context so
    3618             :                          * ext4_mb_load_buddy() should never fail.
    3619             :                          */
    3620           0 :                         WARN(1, "mb_load_buddy failed (%d)", err);
    3621           0 :                         return;
    3622             :                 }
    3623           0 :                 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
    3624           0 :                 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
    3625             :                                ac->ac_f_ex.fe_len);
    3626           0 :                 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
    3627           0 :                 ext4_mb_unload_buddy(&e4b);
    3628           0 :                 return;
    3629             :         }
    3630           0 :         if (pa->pa_type == MB_INODE_PA)
    3631           0 :                 pa->pa_free += ac->ac_b_ex.fe_len;
    3632             : }
    3633             : 
    3634             : /*
    3635             :  * use blocks preallocated to inode
    3636             :  */
    3637           2 : static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
    3638             :                                 struct ext4_prealloc_space *pa)
    3639             : {
    3640           2 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    3641           2 :         ext4_fsblk_t start;
    3642           2 :         ext4_fsblk_t end;
    3643           2 :         int len;
    3644             : 
    3645             :         /* found preallocated blocks, use them */
    3646           2 :         start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
    3647           2 :         end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
    3648             :                   start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
    3649           2 :         len = EXT4_NUM_B2C(sbi, end - start);
    3650           2 :         ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
    3651             :                                         &ac->ac_b_ex.fe_start);
    3652           2 :         ac->ac_b_ex.fe_len = len;
    3653           2 :         ac->ac_status = AC_STATUS_FOUND;
    3654           2 :         ac->ac_pa = pa;
    3655             : 
    3656           2 :         BUG_ON(start < pa->pa_pstart);
    3657           2 :         BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
    3658           2 :         BUG_ON(pa->pa_free < len);
    3659           2 :         pa->pa_free -= len;
    3660             : 
    3661           2 :         mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
    3662           2 : }
    3663             : 
    3664             : /*
    3665             :  * use blocks preallocated to locality group
    3666             :  */
    3667          52 : static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
    3668             :                                 struct ext4_prealloc_space *pa)
    3669             : {
    3670          52 :         unsigned int len = ac->ac_o_ex.fe_len;
    3671             : 
    3672          52 :         ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
    3673             :                                         &ac->ac_b_ex.fe_group,
    3674             :                                         &ac->ac_b_ex.fe_start);
    3675          52 :         ac->ac_b_ex.fe_len = len;
    3676          52 :         ac->ac_status = AC_STATUS_FOUND;
    3677          52 :         ac->ac_pa = pa;
    3678             : 
    3679             :         /* we don't correct pa_pstart or pa_plen here to avoid
    3680             :          * possible race when the group is being loaded concurrently
    3681             :          * instead we correct pa later, after blocks are marked
    3682             :          * in on-disk bitmap -- see ext4_mb_release_context()
    3683             :          * Other CPUs are prevented from allocating from this pa by lg_mutex
    3684             :          */
    3685          52 :         mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
    3686             :                  pa->pa_lstart-len, len, pa);
    3687          52 : }
    3688             : 
    3689             : /*
    3690             :  * Return the prealloc space that have minimal distance
    3691             :  * from the goal block. @cpa is the prealloc
    3692             :  * space that is having currently known minimal distance
    3693             :  * from the goal block.
    3694             :  */
    3695             : static struct ext4_prealloc_space *
    3696          48 : ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
    3697             :                         struct ext4_prealloc_space *pa,
    3698             :                         struct ext4_prealloc_space *cpa)
    3699             : {
    3700          48 :         ext4_fsblk_t cur_distance, new_distance;
    3701             : 
    3702          48 :         if (cpa == NULL) {
    3703          48 :                 atomic_inc(&pa->pa_count);
    3704          48 :                 return pa;
    3705             :         }
    3706           0 :         cur_distance = abs(goal_block - cpa->pa_pstart);
    3707           0 :         new_distance = abs(goal_block - pa->pa_pstart);
    3708             : 
    3709           0 :         if (cur_distance <= new_distance)
    3710             :                 return cpa;
    3711             : 
    3712             :         /* drop the previous reference */
    3713           0 :         atomic_dec(&cpa->pa_count);
    3714           0 :         atomic_inc(&pa->pa_count);
    3715           0 :         return pa;
    3716             : }
    3717             : 
    3718             : /*
    3719             :  * search goal blocks in preallocated space
    3720             :  */
    3721             : static noinline_for_stack bool
    3722         245 : ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
    3723             : {
    3724         245 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    3725         245 :         int order, i;
    3726         245 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    3727         245 :         struct ext4_locality_group *lg;
    3728         245 :         struct ext4_prealloc_space *pa, *cpa = NULL;
    3729         245 :         ext4_fsblk_t goal_block;
    3730             : 
    3731             :         /* only data can be preallocated */
    3732         245 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    3733             :                 return false;
    3734             : 
    3735             :         /* first, try per-file preallocation */
    3736          70 :         rcu_read_lock();
    3737          70 :         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
    3738             : 
    3739             :                 /* all fields in this condition don't change,
    3740             :                  * so we can skip locking for them */
    3741           0 :                 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
    3742           0 :                     ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
    3743           0 :                                                EXT4_C2B(sbi, pa->pa_len)))
    3744           0 :                         continue;
    3745             : 
    3746             :                 /* non-extent files can't have physical blocks past 2^32 */
    3747           0 :                 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
    3748           0 :                     (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
    3749             :                      EXT4_MAX_BLOCK_FILE_PHYS))
    3750           0 :                         continue;
    3751             : 
    3752             :                 /* found preallocated blocks, use them */
    3753           0 :                 spin_lock(&pa->pa_lock);
    3754           0 :                 if (pa->pa_deleted == 0 && pa->pa_free) {
    3755           0 :                         atomic_inc(&pa->pa_count);
    3756           0 :                         ext4_mb_use_inode_pa(ac, pa);
    3757           0 :                         spin_unlock(&pa->pa_lock);
    3758           0 :                         ac->ac_criteria = 10;
    3759           0 :                         rcu_read_unlock();
    3760           0 :                         return true;
    3761             :                 }
    3762           0 :                 spin_unlock(&pa->pa_lock);
    3763             :         }
    3764          70 :         rcu_read_unlock();
    3765             : 
    3766             :         /* can we use group allocation? */
    3767          70 :         if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
    3768             :                 return false;
    3769             : 
    3770             :         /* inode may have no locality group for some reason */
    3771          52 :         lg = ac->ac_lg;
    3772          52 :         if (lg == NULL)
    3773             :                 return false;
    3774          52 :         order  = fls(ac->ac_o_ex.fe_len) - 1;
    3775          52 :         if (order > PREALLOC_TB_SIZE - 1)
    3776             :                 /* The max size of hash table is PREALLOC_TB_SIZE */
    3777             :                 order = PREALLOC_TB_SIZE - 1;
    3778             : 
    3779          52 :         goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
    3780             :         /*
    3781             :          * search for the prealloc space that is having
    3782             :          * minimal distance from the goal block.
    3783             :          */
    3784         503 :         for (i = order; i < PREALLOC_TB_SIZE; i++) {
    3785         451 :                 rcu_read_lock();
    3786         499 :                 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
    3787             :                                         pa_inode_list) {
    3788          48 :                         spin_lock(&pa->pa_lock);
    3789          48 :                         if (pa->pa_deleted == 0 &&
    3790          48 :                                         pa->pa_free >= ac->ac_o_ex.fe_len) {
    3791             : 
    3792          48 :                                 cpa = ext4_mb_check_group_pa(goal_block,
    3793             :                                                                 pa, cpa);
    3794             :                         }
    3795          48 :                         spin_unlock(&pa->pa_lock);
    3796             :                 }
    3797         451 :                 rcu_read_unlock();
    3798             :         }
    3799          52 :         if (cpa) {
    3800          48 :                 ext4_mb_use_group_pa(ac, cpa);
    3801          48 :                 ac->ac_criteria = 20;
    3802          48 :                 return true;
    3803             :         }
    3804             :         return false;
    3805             : }
    3806             : 
    3807             : /*
    3808             :  * the function goes through all block freed in the group
    3809             :  * but not yet committed and marks them used in in-core bitmap.
    3810             :  * buddy must be generated from this bitmap
    3811             :  * Need to be called with the ext4 group lock held
    3812             :  */
    3813          16 : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
    3814             :                                                 ext4_group_t group)
    3815             : {
    3816          16 :         struct rb_node *n;
    3817          16 :         struct ext4_group_info *grp;
    3818          16 :         struct ext4_free_data *entry;
    3819             : 
    3820          16 :         grp = ext4_get_group_info(sb, group);
    3821          16 :         n = rb_first(&(grp->bb_free_root));
    3822             : 
    3823          16 :         while (n) {
    3824           0 :                 entry = rb_entry(n, struct ext4_free_data, efd_node);
    3825           0 :                 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
    3826           0 :                 n = rb_next(n);
    3827             :         }
    3828          16 :         return;
    3829             : }
    3830             : 
    3831             : /*
    3832             :  * the function goes through all preallocation in this group and marks them
    3833             :  * used in in-core bitmap. buddy must be generated from this bitmap
    3834             :  * Need to be called with ext4 group lock held
    3835             :  */
    3836             : static noinline_for_stack
    3837          16 : void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
    3838             :                                         ext4_group_t group)
    3839             : {
    3840          16 :         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    3841          16 :         struct ext4_prealloc_space *pa;
    3842          16 :         struct list_head *cur;
    3843          16 :         ext4_group_t groupnr;
    3844          16 :         ext4_grpblk_t start;
    3845          16 :         int preallocated = 0;
    3846          16 :         int len;
    3847             : 
    3848             :         /* all form of preallocation discards first load group,
    3849             :          * so the only competing code is preallocation use.
    3850             :          * we don't need any locking here
    3851             :          * notice we do NOT ignore preallocations with pa_deleted
    3852             :          * otherwise we could leave used blocks available for
    3853             :          * allocation in buddy when concurrent ext4_mb_put_pa()
    3854             :          * is dropping preallocation
    3855             :          */
    3856          16 :         list_for_each(cur, &grp->bb_prealloc_list) {
    3857           0 :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
    3858           0 :                 spin_lock(&pa->pa_lock);
    3859           0 :                 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
    3860             :                                              &groupnr, &start);
    3861           0 :                 len = pa->pa_len;
    3862           0 :                 spin_unlock(&pa->pa_lock);
    3863           0 :                 if (unlikely(len == 0))
    3864           0 :                         continue;
    3865           0 :                 BUG_ON(groupnr != group);
    3866           0 :                 ext4_set_bits(bitmap, start, len);
    3867           0 :                 preallocated += len;
    3868             :         }
    3869          16 :         mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
    3870          16 : }
    3871             : 
    3872           2 : static void ext4_mb_mark_pa_deleted(struct super_block *sb,
    3873             :                                     struct ext4_prealloc_space *pa)
    3874             : {
    3875           2 :         struct ext4_inode_info *ei;
    3876             : 
    3877           2 :         if (pa->pa_deleted) {
    3878           0 :                 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
    3879             :                              pa->pa_type, pa->pa_pstart, pa->pa_lstart,
    3880             :                              pa->pa_len);
    3881           0 :                 return;
    3882             :         }
    3883             : 
    3884           2 :         pa->pa_deleted = 1;
    3885             : 
    3886           2 :         if (pa->pa_type == MB_INODE_PA) {
    3887           2 :                 ei = EXT4_I(pa->pa_inode);
    3888           2 :                 atomic_dec(&ei->i_prealloc_active);
    3889             :         }
    3890             : }
    3891             : 
    3892           2 : static void ext4_mb_pa_callback(struct rcu_head *head)
    3893             : {
    3894           2 :         struct ext4_prealloc_space *pa;
    3895           2 :         pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
    3896             : 
    3897           2 :         BUG_ON(atomic_read(&pa->pa_count));
    3898           2 :         BUG_ON(pa->pa_deleted == 0);
    3899           2 :         kmem_cache_free(ext4_pspace_cachep, pa);
    3900           2 : }
    3901             : 
    3902             : /*
    3903             :  * drops a reference to preallocated space descriptor
    3904             :  * if this was the last reference and the space is consumed
    3905             :  */
    3906          54 : static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
    3907             :                         struct super_block *sb, struct ext4_prealloc_space *pa)
    3908             : {
    3909          54 :         ext4_group_t grp;
    3910          54 :         ext4_fsblk_t grp_blk;
    3911             : 
    3912             :         /* in this short window concurrent discard can set pa_deleted */
    3913          54 :         spin_lock(&pa->pa_lock);
    3914         108 :         if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
    3915          54 :                 spin_unlock(&pa->pa_lock);
    3916          54 :                 return;
    3917             :         }
    3918             : 
    3919           0 :         if (pa->pa_deleted == 1) {
    3920           0 :                 spin_unlock(&pa->pa_lock);
    3921           0 :                 return;
    3922             :         }
    3923             : 
    3924           0 :         ext4_mb_mark_pa_deleted(sb, pa);
    3925           0 :         spin_unlock(&pa->pa_lock);
    3926             : 
    3927           0 :         grp_blk = pa->pa_pstart;
    3928             :         /*
    3929             :          * If doing group-based preallocation, pa_pstart may be in the
    3930             :          * next group when pa is used up
    3931             :          */
    3932           0 :         if (pa->pa_type == MB_GROUP_PA)
    3933           0 :                 grp_blk--;
    3934             : 
    3935           0 :         grp = ext4_get_group_number(sb, grp_blk);
    3936             : 
    3937             :         /*
    3938             :          * possible race:
    3939             :          *
    3940             :          *  P1 (buddy init)                     P2 (regular allocation)
    3941             :          *                                      find block B in PA
    3942             :          *  copy on-disk bitmap to buddy
    3943             :          *                                      mark B in on-disk bitmap
    3944             :          *                                      drop PA from group
    3945             :          *  mark all PAs in buddy
    3946             :          *
    3947             :          * thus, P1 initializes buddy with B available. to prevent this
    3948             :          * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
    3949             :          * against that pair
    3950             :          */
    3951           0 :         ext4_lock_group(sb, grp);
    3952           0 :         list_del(&pa->pa_group_list);
    3953           0 :         ext4_unlock_group(sb, grp);
    3954             : 
    3955           0 :         spin_lock(pa->pa_obj_lock);
    3956           0 :         list_del_rcu(&pa->pa_inode_list);
    3957           0 :         spin_unlock(pa->pa_obj_lock);
    3958             : 
    3959           0 :         call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    3960             : }
    3961             : 
    3962             : /*
    3963             :  * creates new preallocated space for given inode
    3964             :  */
    3965             : static noinline_for_stack void
    3966           2 : ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
    3967             : {
    3968           2 :         struct super_block *sb = ac->ac_sb;
    3969           2 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3970           2 :         struct ext4_prealloc_space *pa;
    3971           2 :         struct ext4_group_info *grp;
    3972           2 :         struct ext4_inode_info *ei;
    3973             : 
    3974             :         /* preallocate only when found space is larger then requested */
    3975           2 :         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
    3976           2 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    3977           2 :         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
    3978           2 :         BUG_ON(ac->ac_pa == NULL);
    3979             : 
    3980           2 :         pa = ac->ac_pa;
    3981             : 
    3982           2 :         if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
    3983           1 :                 int winl;
    3984           1 :                 int wins;
    3985           1 :                 int win;
    3986           1 :                 int offs;
    3987             : 
    3988             :                 /* we can't allocate as much as normalizer wants.
    3989             :                  * so, found space must get proper lstart
    3990             :                  * to cover original request */
    3991           1 :                 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
    3992           1 :                 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
    3993             : 
    3994             :                 /* we're limited by original request in that
    3995             :                  * logical block must be covered any way
    3996             :                  * winl is window we can move our chunk within */
    3997           1 :                 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
    3998             : 
    3999             :                 /* also, we should cover whole original request */
    4000           1 :                 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
    4001             : 
    4002             :                 /* the smallest one defines real window */
    4003           1 :                 win = min(winl, wins);
    4004             : 
    4005           1 :                 offs = ac->ac_o_ex.fe_logical %
    4006           1 :                         EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    4007           1 :                 if (offs && offs < win)
    4008           0 :                         win = offs;
    4009             : 
    4010           1 :                 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
    4011           1 :                         EXT4_NUM_B2C(sbi, win);
    4012           1 :                 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
    4013           1 :                 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
    4014             :         }
    4015             : 
    4016             :         /* preallocation can change ac_b_ex, thus we store actually
    4017             :          * allocated blocks for history */
    4018           2 :         ac->ac_f_ex = ac->ac_b_ex;
    4019             : 
    4020           2 :         pa->pa_lstart = ac->ac_b_ex.fe_logical;
    4021           2 :         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    4022           2 :         pa->pa_len = ac->ac_b_ex.fe_len;
    4023           2 :         pa->pa_free = pa->pa_len;
    4024           2 :         spin_lock_init(&pa->pa_lock);
    4025           2 :         INIT_LIST_HEAD(&pa->pa_inode_list);
    4026           2 :         INIT_LIST_HEAD(&pa->pa_group_list);
    4027           2 :         pa->pa_deleted = 0;
    4028           2 :         pa->pa_type = MB_INODE_PA;
    4029             : 
    4030           2 :         mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
    4031             :                  pa->pa_len, pa->pa_lstart);
    4032           2 :         trace_ext4_mb_new_inode_pa(ac, pa);
    4033             : 
    4034           2 :         ext4_mb_use_inode_pa(ac, pa);
    4035           2 :         atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
    4036             : 
    4037           2 :         ei = EXT4_I(ac->ac_inode);
    4038           2 :         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
    4039             : 
    4040           2 :         pa->pa_obj_lock = &ei->i_prealloc_lock;
    4041           2 :         pa->pa_inode = ac->ac_inode;
    4042             : 
    4043           2 :         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
    4044             : 
    4045           2 :         spin_lock(pa->pa_obj_lock);
    4046           2 :         list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
    4047           2 :         spin_unlock(pa->pa_obj_lock);
    4048           2 :         atomic_inc(&ei->i_prealloc_active);
    4049           2 : }
    4050             : 
    4051             : /*
    4052             :  * creates new preallocated space for locality group inodes belongs to
    4053             :  */
    4054             : static noinline_for_stack void
    4055           4 : ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
    4056             : {
    4057           4 :         struct super_block *sb = ac->ac_sb;
    4058           4 :         struct ext4_locality_group *lg;
    4059           4 :         struct ext4_prealloc_space *pa;
    4060           4 :         struct ext4_group_info *grp;
    4061             : 
    4062             :         /* preallocate only when found space is larger then requested */
    4063           4 :         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
    4064           4 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    4065           4 :         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
    4066           4 :         BUG_ON(ac->ac_pa == NULL);
    4067             : 
    4068           4 :         pa = ac->ac_pa;
    4069             : 
    4070             :         /* preallocation can change ac_b_ex, thus we store actually
    4071             :          * allocated blocks for history */
    4072           4 :         ac->ac_f_ex = ac->ac_b_ex;
    4073             : 
    4074           4 :         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    4075           4 :         pa->pa_lstart = pa->pa_pstart;
    4076           4 :         pa->pa_len = ac->ac_b_ex.fe_len;
    4077           4 :         pa->pa_free = pa->pa_len;
    4078           4 :         spin_lock_init(&pa->pa_lock);
    4079           4 :         INIT_LIST_HEAD(&pa->pa_inode_list);
    4080           4 :         INIT_LIST_HEAD(&pa->pa_group_list);
    4081           4 :         pa->pa_deleted = 0;
    4082           4 :         pa->pa_type = MB_GROUP_PA;
    4083             : 
    4084           4 :         mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
    4085             :                  pa->pa_len, pa->pa_lstart);
    4086           4 :         trace_ext4_mb_new_group_pa(ac, pa);
    4087             : 
    4088           4 :         ext4_mb_use_group_pa(ac, pa);
    4089           4 :         atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
    4090             : 
    4091           4 :         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
    4092           4 :         lg = ac->ac_lg;
    4093           4 :         BUG_ON(lg == NULL);
    4094             : 
    4095           4 :         pa->pa_obj_lock = &lg->lg_prealloc_lock;
    4096           4 :         pa->pa_inode = NULL;
    4097             : 
    4098           4 :         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
    4099             : 
    4100             :         /*
    4101             :          * We will later add the new pa to the right bucket
    4102             :          * after updating the pa_free in ext4_mb_release_context
    4103             :          */
    4104           4 : }
    4105             : 
    4106           6 : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
    4107             : {
    4108           6 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
    4109           4 :                 ext4_mb_new_group_pa(ac);
    4110             :         else
    4111           2 :                 ext4_mb_new_inode_pa(ac);
    4112           6 : }
    4113             : 
    4114             : /*
    4115             :  * finds all unused blocks in on-disk bitmap, frees them in
    4116             :  * in-core bitmap and buddy.
    4117             :  * @pa must be unlinked from inode and group lists, so that
    4118             :  * nobody else can find/use it.
    4119             :  * the caller MUST hold group/inode locks.
    4120             :  * TODO: optimize the case when there are no in-core structures yet
    4121             :  */
    4122             : static noinline_for_stack int
    4123           2 : ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
    4124             :                         struct ext4_prealloc_space *pa)
    4125             : {
    4126           2 :         struct super_block *sb = e4b->bd_sb;
    4127           2 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    4128           2 :         unsigned int end;
    4129           2 :         unsigned int next;
    4130           2 :         ext4_group_t group;
    4131           2 :         ext4_grpblk_t bit;
    4132           2 :         unsigned long long grp_blk_start;
    4133           2 :         int free = 0;
    4134             : 
    4135           2 :         BUG_ON(pa->pa_deleted == 0);
    4136           2 :         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
    4137           2 :         grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
    4138           2 :         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
    4139           2 :         end = bit + pa->pa_len;
    4140             : 
    4141           4 :         while (bit < end) {
    4142           2 :                 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
    4143           2 :                 if (bit >= end)
    4144             :                         break;
    4145           2 :                 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
    4146           2 :                 mb_debug(sb, "free preallocated %u/%u in group %u\n",
    4147             :                          (unsigned) ext4_group_first_block_no(sb, group) + bit,
    4148             :                          (unsigned) next - bit, (unsigned) group);
    4149           2 :                 free += next - bit;
    4150             : 
    4151           2 :                 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
    4152           2 :                 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
    4153           2 :                                                     EXT4_C2B(sbi, bit)),
    4154             :                                                next - bit);
    4155           2 :                 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
    4156           2 :                 bit = next + 1;
    4157             :         }
    4158           2 :         if (free != pa->pa_free) {
    4159           0 :                 ext4_msg(e4b->bd_sb, KERN_CRIT,
    4160             :                          "pa %p: logic %lu, phys. %lu, len %d",
    4161             :                          pa, (unsigned long) pa->pa_lstart,
    4162             :                          (unsigned long) pa->pa_pstart,
    4163             :                          pa->pa_len);
    4164           0 :                 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
    4165             :                                         free, pa->pa_free);
    4166             :                 /*
    4167             :                  * pa is already deleted so we use the value obtained
    4168             :                  * from the bitmap and continue.
    4169             :                  */
    4170             :         }
    4171           2 :         atomic_add(free, &sbi->s_mb_discarded);
    4172             : 
    4173           2 :         return 0;
    4174             : }
    4175             : 
    4176             : static noinline_for_stack int
    4177           0 : ext4_mb_release_group_pa(struct ext4_buddy *e4b,
    4178             :                                 struct ext4_prealloc_space *pa)
    4179             : {
    4180           0 :         struct super_block *sb = e4b->bd_sb;
    4181           0 :         ext4_group_t group;
    4182           0 :         ext4_grpblk_t bit;
    4183             : 
    4184           0 :         trace_ext4_mb_release_group_pa(sb, pa);
    4185           0 :         BUG_ON(pa->pa_deleted == 0);
    4186           0 :         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
    4187           0 :         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
    4188           0 :         mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
    4189           0 :         atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
    4190           0 :         trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
    4191             : 
    4192           0 :         return 0;
    4193             : }
    4194             : 
    4195             : /*
    4196             :  * releases all preallocations in given group
    4197             :  *
    4198             :  * first, we need to decide discard policy:
    4199             :  * - when do we discard
    4200             :  *   1) ENOSPC
    4201             :  * - how many do we discard
    4202             :  *   1) how many requested
    4203             :  */
    4204             : static noinline_for_stack int
    4205           0 : ext4_mb_discard_group_preallocations(struct super_block *sb,
    4206             :                                         ext4_group_t group, int needed)
    4207             : {
    4208           0 :         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    4209           0 :         struct buffer_head *bitmap_bh = NULL;
    4210           0 :         struct ext4_prealloc_space *pa, *tmp;
    4211           0 :         struct list_head list;
    4212           0 :         struct ext4_buddy e4b;
    4213           0 :         int err;
    4214           0 :         int busy = 0;
    4215           0 :         int free, free_total = 0;
    4216             : 
    4217           0 :         mb_debug(sb, "discard preallocation for group %u\n", group);
    4218           0 :         if (list_empty(&grp->bb_prealloc_list))
    4219           0 :                 goto out_dbg;
    4220             : 
    4221           0 :         bitmap_bh = ext4_read_block_bitmap(sb, group);
    4222           0 :         if (IS_ERR(bitmap_bh)) {
    4223           0 :                 err = PTR_ERR(bitmap_bh);
    4224           0 :                 ext4_error_err(sb, -err,
    4225             :                                "Error %d reading block bitmap for %u",
    4226             :                                err, group);
    4227           0 :                 goto out_dbg;
    4228             :         }
    4229             : 
    4230           0 :         err = ext4_mb_load_buddy(sb, group, &e4b);
    4231           0 :         if (err) {
    4232           0 :                 ext4_warning(sb, "Error %d loading buddy information for %u",
    4233             :                              err, group);
    4234           0 :                 put_bh(bitmap_bh);
    4235           0 :                 goto out_dbg;
    4236             :         }
    4237             : 
    4238           0 :         if (needed == 0)
    4239           0 :                 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
    4240             : 
    4241           0 :         INIT_LIST_HEAD(&list);
    4242           0 : repeat:
    4243           0 :         free = 0;
    4244           0 :         ext4_lock_group(sb, group);
    4245           0 :         list_for_each_entry_safe(pa, tmp,
    4246             :                                 &grp->bb_prealloc_list, pa_group_list) {
    4247           0 :                 spin_lock(&pa->pa_lock);
    4248           0 :                 if (atomic_read(&pa->pa_count)) {
    4249           0 :                         spin_unlock(&pa->pa_lock);
    4250           0 :                         busy = 1;
    4251           0 :                         continue;
    4252             :                 }
    4253           0 :                 if (pa->pa_deleted) {
    4254           0 :                         spin_unlock(&pa->pa_lock);
    4255           0 :                         continue;
    4256             :                 }
    4257             : 
    4258             :                 /* seems this one can be freed ... */
    4259           0 :                 ext4_mb_mark_pa_deleted(sb, pa);
    4260             : 
    4261           0 :                 if (!free)
    4262           0 :                         this_cpu_inc(discard_pa_seq);
    4263             : 
    4264             :                 /* we can trust pa_free ... */
    4265           0 :                 free += pa->pa_free;
    4266             : 
    4267           0 :                 spin_unlock(&pa->pa_lock);
    4268             : 
    4269           0 :                 list_del(&pa->pa_group_list);
    4270           0 :                 list_add(&pa->u.pa_tmp_list, &list);
    4271             :         }
    4272             : 
    4273             :         /* now free all selected PAs */
    4274           0 :         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
    4275             : 
    4276             :                 /* remove from object (inode or locality group) */
    4277           0 :                 spin_lock(pa->pa_obj_lock);
    4278           0 :                 list_del_rcu(&pa->pa_inode_list);
    4279           0 :                 spin_unlock(pa->pa_obj_lock);
    4280             : 
    4281           0 :                 if (pa->pa_type == MB_GROUP_PA)
    4282           0 :                         ext4_mb_release_group_pa(&e4b, pa);
    4283             :                 else
    4284           0 :                         ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
    4285             : 
    4286           0 :                 list_del(&pa->u.pa_tmp_list);
    4287           0 :                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    4288             :         }
    4289             : 
    4290           0 :         free_total += free;
    4291             : 
    4292             :         /* if we still need more blocks and some PAs were used, try again */
    4293           0 :         if (free_total < needed && busy) {
    4294           0 :                 ext4_unlock_group(sb, group);
    4295           0 :                 cond_resched();
    4296           0 :                 busy = 0;
    4297           0 :                 goto repeat;
    4298             :         }
    4299           0 :         ext4_unlock_group(sb, group);
    4300           0 :         ext4_mb_unload_buddy(&e4b);
    4301           0 :         put_bh(bitmap_bh);
    4302           0 : out_dbg:
    4303           0 :         mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
    4304             :                  free_total, group, grp->bb_free);
    4305           0 :         return free_total;
    4306             : }
    4307             : 
    4308             : /*
    4309             :  * releases all non-used preallocated blocks for given inode
    4310             :  *
    4311             :  * It's important to discard preallocations under i_data_sem
    4312             :  * We don't want another block to be served from the prealloc
    4313             :  * space when we are discarding the inode prealloc space.
    4314             :  *
    4315             :  * FIXME!! Make sure it is valid at all the call sites
    4316             :  */
    4317         449 : void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
    4318             : {
    4319         449 :         struct ext4_inode_info *ei = EXT4_I(inode);
    4320         449 :         struct super_block *sb = inode->i_sb;
    4321         449 :         struct buffer_head *bitmap_bh = NULL;
    4322         449 :         struct ext4_prealloc_space *pa, *tmp;
    4323         449 :         ext4_group_t group = 0;
    4324         449 :         struct list_head list;
    4325         449 :         struct ext4_buddy e4b;
    4326         449 :         int err;
    4327             : 
    4328         449 :         if (!S_ISREG(inode->i_mode)) {
    4329             :                 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
    4330         142 :                 return;
    4331             :         }
    4332             : 
    4333         307 :         if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
    4334             :                 return;
    4335             : 
    4336         307 :         mb_debug(sb, "discard preallocation for inode %lu\n",
    4337             :                  inode->i_ino);
    4338         614 :         trace_ext4_discard_preallocations(inode,
    4339         307 :                         atomic_read(&ei->i_prealloc_active), needed);
    4340             : 
    4341         307 :         INIT_LIST_HEAD(&list);
    4342             : 
    4343         307 :         if (needed == 0)
    4344         307 :                 needed = UINT_MAX;
    4345             : 
    4346           0 : repeat:
    4347             :         /* first, collect all pa's in the inode */
    4348         307 :         spin_lock(&ei->i_prealloc_lock);
    4349         309 :         while (!list_empty(&ei->i_prealloc_list) && needed) {
    4350           2 :                 pa = list_entry(ei->i_prealloc_list.prev,
    4351             :                                 struct ext4_prealloc_space, pa_inode_list);
    4352           2 :                 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
    4353           2 :                 spin_lock(&pa->pa_lock);
    4354           2 :                 if (atomic_read(&pa->pa_count)) {
    4355             :                         /* this shouldn't happen often - nobody should
    4356             :                          * use preallocation while we're discarding it */
    4357           0 :                         spin_unlock(&pa->pa_lock);
    4358           0 :                         spin_unlock(&ei->i_prealloc_lock);
    4359           0 :                         ext4_msg(sb, KERN_ERR,
    4360             :                                  "uh-oh! used pa while discarding");
    4361           0 :                         WARN_ON(1);
    4362           0 :                         schedule_timeout_uninterruptible(HZ);
    4363           0 :                         goto repeat;
    4364             : 
    4365             :                 }
    4366           2 :                 if (pa->pa_deleted == 0) {
    4367           2 :                         ext4_mb_mark_pa_deleted(sb, pa);
    4368           2 :                         spin_unlock(&pa->pa_lock);
    4369           2 :                         list_del_rcu(&pa->pa_inode_list);
    4370           2 :                         list_add(&pa->u.pa_tmp_list, &list);
    4371           2 :                         needed--;
    4372           2 :                         continue;
    4373             :                 }
    4374             : 
    4375             :                 /* someone is deleting pa right now */
    4376           0 :                 spin_unlock(&pa->pa_lock);
    4377           0 :                 spin_unlock(&ei->i_prealloc_lock);
    4378             : 
    4379             :                 /* we have to wait here because pa_deleted
    4380             :                  * doesn't mean pa is already unlinked from
    4381             :                  * the list. as we might be called from
    4382             :                  * ->clear_inode() the inode will get freed
    4383             :                  * and concurrent thread which is unlinking
    4384             :                  * pa from inode's list may access already
    4385             :                  * freed memory, bad-bad-bad */
    4386             : 
    4387             :                 /* XXX: if this happens too often, we can
    4388             :                  * add a flag to force wait only in case
    4389             :                  * of ->clear_inode(), but not in case of
    4390             :                  * regular truncate */
    4391           0 :                 schedule_timeout_uninterruptible(HZ);
    4392           0 :                 goto repeat;
    4393             :         }
    4394         307 :         spin_unlock(&ei->i_prealloc_lock);
    4395             : 
    4396         309 :         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
    4397           2 :                 BUG_ON(pa->pa_type != MB_INODE_PA);
    4398           2 :                 group = ext4_get_group_number(sb, pa->pa_pstart);
    4399             : 
    4400           2 :                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
    4401             :                                              GFP_NOFS|__GFP_NOFAIL);
    4402           2 :                 if (err) {
    4403           0 :                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
    4404             :                                        err, group);
    4405           0 :                         continue;
    4406             :                 }
    4407             : 
    4408           2 :                 bitmap_bh = ext4_read_block_bitmap(sb, group);
    4409           2 :                 if (IS_ERR(bitmap_bh)) {
    4410           0 :                         err = PTR_ERR(bitmap_bh);
    4411           0 :                         ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
    4412             :                                        err, group);
    4413           0 :                         ext4_mb_unload_buddy(&e4b);
    4414           0 :                         continue;
    4415             :                 }
    4416             : 
    4417           2 :                 ext4_lock_group(sb, group);
    4418           2 :                 list_del(&pa->pa_group_list);
    4419           2 :                 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
    4420           2 :                 ext4_unlock_group(sb, group);
    4421             : 
    4422           2 :                 ext4_mb_unload_buddy(&e4b);
    4423           2 :                 put_bh(bitmap_bh);
    4424             : 
    4425           2 :                 list_del(&pa->u.pa_tmp_list);
    4426           2 :                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    4427             :         }
    4428             : }
    4429             : 
    4430         197 : static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
    4431             : {
    4432         197 :         struct ext4_prealloc_space *pa;
    4433             : 
    4434         197 :         BUG_ON(ext4_pspace_cachep == NULL);
    4435         197 :         pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
    4436         197 :         if (!pa)
    4437             :                 return -ENOMEM;
    4438         197 :         atomic_set(&pa->pa_count, 1);
    4439         197 :         ac->ac_pa = pa;
    4440         197 :         return 0;
    4441             : }
    4442             : 
    4443         191 : static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
    4444             : {
    4445         191 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    4446             : 
    4447         191 :         BUG_ON(!pa);
    4448         191 :         ac->ac_pa = NULL;
    4449         382 :         WARN_ON(!atomic_dec_and_test(&pa->pa_count));
    4450         191 :         kmem_cache_free(ext4_pspace_cachep, pa);
    4451         191 : }
    4452             : 
    4453             : #ifdef CONFIG_EXT4_DEBUG
    4454             : static inline void ext4_mb_show_pa(struct super_block *sb)
    4455             : {
    4456             :         ext4_group_t i, ngroups;
    4457             : 
    4458             :         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
    4459             :                 return;
    4460             : 
    4461             :         ngroups = ext4_get_groups_count(sb);
    4462             :         mb_debug(sb, "groups: ");
    4463             :         for (i = 0; i < ngroups; i++) {
    4464             :                 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
    4465             :                 struct ext4_prealloc_space *pa;
    4466             :                 ext4_grpblk_t start;
    4467             :                 struct list_head *cur;
    4468             :                 ext4_lock_group(sb, i);
    4469             :                 list_for_each(cur, &grp->bb_prealloc_list) {
    4470             :                         pa = list_entry(cur, struct ext4_prealloc_space,
    4471             :                                         pa_group_list);
    4472             :                         spin_lock(&pa->pa_lock);
    4473             :                         ext4_get_group_no_and_offset(sb, pa->pa_pstart,
    4474             :                                                      NULL, &start);
    4475             :                         spin_unlock(&pa->pa_lock);
    4476             :                         mb_debug(sb, "PA:%u:%d:%d\n", i, start,
    4477             :                                  pa->pa_len);
    4478             :                 }
    4479             :                 ext4_unlock_group(sb, i);
    4480             :                 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
    4481             :                          grp->bb_fragments);
    4482             :         }
    4483             : }
    4484             : 
    4485             : static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
    4486             : {
    4487             :         struct super_block *sb = ac->ac_sb;
    4488             : 
    4489             :         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
    4490             :                 return;
    4491             : 
    4492             :         mb_debug(sb, "Can't allocate:"
    4493             :                         " Allocation context details:");
    4494             :         mb_debug(sb, "status %u flags 0x%x",
    4495             :                         ac->ac_status, ac->ac_flags);
    4496             :         mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
    4497             :                         "goal %lu/%lu/%lu@%lu, "
    4498             :                         "best %lu/%lu/%lu@%lu cr %d",
    4499             :                         (unsigned long)ac->ac_o_ex.fe_group,
    4500             :                         (unsigned long)ac->ac_o_ex.fe_start,
    4501             :                         (unsigned long)ac->ac_o_ex.fe_len,
    4502             :                         (unsigned long)ac->ac_o_ex.fe_logical,
    4503             :                         (unsigned long)ac->ac_g_ex.fe_group,
    4504             :                         (unsigned long)ac->ac_g_ex.fe_start,
    4505             :                         (unsigned long)ac->ac_g_ex.fe_len,
    4506             :                         (unsigned long)ac->ac_g_ex.fe_logical,
    4507             :                         (unsigned long)ac->ac_b_ex.fe_group,
    4508             :                         (unsigned long)ac->ac_b_ex.fe_start,
    4509             :                         (unsigned long)ac->ac_b_ex.fe_len,
    4510             :                         (unsigned long)ac->ac_b_ex.fe_logical,
    4511             :                         (int)ac->ac_criteria);
    4512             :         mb_debug(sb, "%u found", ac->ac_found);
    4513             :         ext4_mb_show_pa(sb);
    4514             : }
    4515             : #else
    4516           0 : static inline void ext4_mb_show_pa(struct super_block *sb)
    4517             : {
    4518           0 :         return;
    4519             : }
    4520           0 : static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
    4521             : {
    4522           0 :         ext4_mb_show_pa(ac->ac_sb);
    4523           0 :         return;
    4524             : }
    4525             : #endif
    4526             : 
    4527             : /*
    4528             :  * We use locality group preallocation for small size file. The size of the
    4529             :  * file is determined by the current size or the resulting size after
    4530             :  * allocation which ever is larger
    4531             :  *
    4532             :  * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
    4533             :  */
    4534         245 : static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
    4535             : {
    4536         245 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4537         245 :         int bsbits = ac->ac_sb->s_blocksize_bits;
    4538         245 :         loff_t size, isize;
    4539             : 
    4540         245 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    4541             :                 return;
    4542             : 
    4543          70 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    4544             :                 return;
    4545             : 
    4546          70 :         size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
    4547          70 :         isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
    4548          70 :                 >> bsbits;
    4549             : 
    4550         140 :         if ((size == isize) && !ext4_fs_is_busy(sbi) &&
    4551          70 :             !inode_is_open_for_write(ac->ac_inode)) {
    4552           9 :                 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
    4553           9 :                 return;
    4554             :         }
    4555             : 
    4556          61 :         if (sbi->s_mb_group_prealloc <= 0) {
    4557           0 :                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
    4558           0 :                 return;
    4559             :         }
    4560             : 
    4561             :         /* don't use group allocation for large files */
    4562          61 :         size = max(size, isize);
    4563          61 :         if (size > sbi->s_mb_stream_request) {
    4564           9 :                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
    4565           9 :                 return;
    4566             :         }
    4567             : 
    4568          52 :         BUG_ON(ac->ac_lg != NULL);
    4569             :         /*
    4570             :          * Locality group prealloc spaces are per-CPU. The reason for having
    4571             :          * per-CPU locality groups is to reduce the contention between block
    4572             :          * requests from multiple CPUs.
    4573             :          */
    4574          52 :         ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
    4575             : 
    4576             :         /* we're going to use group allocation */
    4577          52 :         ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
    4578             : 
    4579             :         /* serialize all allocations in the group */
    4580          52 :         mutex_lock(&ac->ac_lg->lg_mutex);
    4581             : }
    4582             : 
    4583             : static noinline_for_stack int
    4584         245 : ext4_mb_initialize_context(struct ext4_allocation_context *ac,
    4585             :                                 struct ext4_allocation_request *ar)
    4586             : {
    4587         245 :         struct super_block *sb = ar->inode->i_sb;
    4588         245 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    4589         245 :         struct ext4_super_block *es = sbi->s_es;
    4590         245 :         ext4_group_t group;
    4591         245 :         unsigned int len;
    4592         245 :         ext4_fsblk_t goal;
    4593         245 :         ext4_grpblk_t block;
    4594             : 
    4595             :         /* we can't allocate > group size */
    4596         245 :         len = ar->len;
    4597             : 
    4598             :         /* just a dirty hack to filter too big requests  */
    4599         245 :         if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
    4600           0 :                 len = EXT4_CLUSTERS_PER_GROUP(sb);
    4601             : 
    4602             :         /* start searching from the goal */
    4603         245 :         goal = ar->goal;
    4604         245 :         if (goal < le32_to_cpu(es->s_first_data_block) ||
    4605         245 :                         goal >= ext4_blocks_count(es))
    4606             :                 goal = le32_to_cpu(es->s_first_data_block);
    4607         245 :         ext4_get_group_no_and_offset(sb, goal, &group, &block);
    4608             : 
    4609             :         /* set up allocation goals */
    4610         245 :         ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
    4611         245 :         ac->ac_status = AC_STATUS_CONTINUE;
    4612         245 :         ac->ac_sb = sb;
    4613         245 :         ac->ac_inode = ar->inode;
    4614         245 :         ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
    4615         245 :         ac->ac_o_ex.fe_group = group;
    4616         245 :         ac->ac_o_ex.fe_start = block;
    4617         245 :         ac->ac_o_ex.fe_len = len;
    4618         245 :         ac->ac_g_ex = ac->ac_o_ex;
    4619         245 :         ac->ac_flags = ar->flags;
    4620             : 
    4621             :         /* we have to define context: we'll work with a file or
    4622             :          * locality group. this is a policy, actually */
    4623         245 :         ext4_mb_group_or_file(ac);
    4624             : 
    4625         245 :         mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
    4626             :                         "left: %u/%u, right %u/%u to %swritable\n",
    4627             :                         (unsigned) ar->len, (unsigned) ar->logical,
    4628             :                         (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
    4629             :                         (unsigned) ar->lleft, (unsigned) ar->pleft,
    4630             :                         (unsigned) ar->lright, (unsigned) ar->pright,
    4631             :                         inode_is_open_for_write(ar->inode) ? "" : "non-");
    4632         245 :         return 0;
    4633             : 
    4634             : }
    4635             : 
    4636             : static noinline_for_stack void
    4637           0 : ext4_mb_discard_lg_preallocations(struct super_block *sb,
    4638             :                                         struct ext4_locality_group *lg,
    4639             :                                         int order, int total_entries)
    4640             : {
    4641           0 :         ext4_group_t group = 0;
    4642           0 :         struct ext4_buddy e4b;
    4643           0 :         struct list_head discard_list;
    4644           0 :         struct ext4_prealloc_space *pa, *tmp;
    4645             : 
    4646           0 :         mb_debug(sb, "discard locality group preallocation\n");
    4647             : 
    4648           0 :         INIT_LIST_HEAD(&discard_list);
    4649             : 
    4650           0 :         spin_lock(&lg->lg_prealloc_lock);
    4651           0 :         list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
    4652             :                                 pa_inode_list,
    4653             :                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
    4654           0 :                 spin_lock(&pa->pa_lock);
    4655           0 :                 if (atomic_read(&pa->pa_count)) {
    4656             :                         /*
    4657             :                          * This is the pa that we just used
    4658             :                          * for block allocation. So don't
    4659             :                          * free that
    4660             :                          */
    4661           0 :                         spin_unlock(&pa->pa_lock);
    4662           0 :                         continue;
    4663             :                 }
    4664           0 :                 if (pa->pa_deleted) {
    4665           0 :                         spin_unlock(&pa->pa_lock);
    4666           0 :                         continue;
    4667             :                 }
    4668             :                 /* only lg prealloc space */
    4669           0 :                 BUG_ON(pa->pa_type != MB_GROUP_PA);
    4670             : 
    4671             :                 /* seems this one can be freed ... */
    4672           0 :                 ext4_mb_mark_pa_deleted(sb, pa);
    4673           0 :                 spin_unlock(&pa->pa_lock);
    4674             : 
    4675           0 :                 list_del_rcu(&pa->pa_inode_list);
    4676           0 :                 list_add(&pa->u.pa_tmp_list, &discard_list);
    4677             : 
    4678           0 :                 total_entries--;
    4679           0 :                 if (total_entries <= 5) {
    4680             :                         /*
    4681             :                          * we want to keep only 5 entries
    4682             :                          * allowing it to grow to 8. This
    4683             :                          * mak sure we don't call discard
    4684             :                          * soon for this list.
    4685             :                          */
    4686             :                         break;
    4687             :                 }
    4688             :         }
    4689           0 :         spin_unlock(&lg->lg_prealloc_lock);
    4690             : 
    4691           0 :         list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
    4692           0 :                 int err;
    4693             : 
    4694           0 :                 group = ext4_get_group_number(sb, pa->pa_pstart);
    4695           0 :                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
    4696             :                                              GFP_NOFS|__GFP_NOFAIL);
    4697           0 :                 if (err) {
    4698           0 :                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
    4699             :                                        err, group);
    4700           0 :                         continue;
    4701             :                 }
    4702           0 :                 ext4_lock_group(sb, group);
    4703           0 :                 list_del(&pa->pa_group_list);
    4704           0 :                 ext4_mb_release_group_pa(&e4b, pa);
    4705           0 :                 ext4_unlock_group(sb, group);
    4706             : 
    4707           0 :                 ext4_mb_unload_buddy(&e4b);
    4708           0 :                 list_del(&pa->u.pa_tmp_list);
    4709           0 :                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    4710             :         }
    4711           0 : }
    4712             : 
    4713             : /*
    4714             :  * We have incremented pa_count. So it cannot be freed at this
    4715             :  * point. Also we hold lg_mutex. So no parallel allocation is
    4716             :  * possible from this lg. That means pa_free cannot be updated.
    4717             :  *
    4718             :  * A parallel ext4_mb_discard_group_preallocations is possible,
    4719             :  * which can cause the lg_prealloc_list to be updated.
    4720             :  */
    4721             : 
    4722          52 : static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
    4723             : {
    4724          52 :         int order, added = 0, lg_prealloc_count = 1;
    4725          52 :         struct super_block *sb = ac->ac_sb;
    4726          52 :         struct ext4_locality_group *lg = ac->ac_lg;
    4727          52 :         struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
    4728             : 
    4729          52 :         order = fls(pa->pa_free) - 1;
    4730          52 :         if (order > PREALLOC_TB_SIZE - 1)
    4731             :                 /* The max size of hash table is PREALLOC_TB_SIZE */
    4732             :                 order = PREALLOC_TB_SIZE - 1;
    4733             :         /* Add the prealloc space to lg */
    4734          52 :         spin_lock(&lg->lg_prealloc_lock);
    4735          52 :         list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
    4736             :                                 pa_inode_list,
    4737             :                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
    4738           0 :                 spin_lock(&tmp_pa->pa_lock);
    4739           0 :                 if (tmp_pa->pa_deleted) {
    4740           0 :                         spin_unlock(&tmp_pa->pa_lock);
    4741           0 :                         continue;
    4742             :                 }
    4743           0 :                 if (!added && pa->pa_free < tmp_pa->pa_free) {
    4744             :                         /* Add to the tail of the previous entry */
    4745           0 :                         list_add_tail_rcu(&pa->pa_inode_list,
    4746             :                                                 &tmp_pa->pa_inode_list);
    4747           0 :                         added = 1;
    4748             :                         /*
    4749             :                          * we want to count the total
    4750             :                          * number of entries in the list
    4751             :                          */
    4752             :                 }
    4753           0 :                 spin_unlock(&tmp_pa->pa_lock);
    4754           0 :                 lg_prealloc_count++;
    4755             :         }
    4756          52 :         if (!added)
    4757          52 :                 list_add_tail_rcu(&pa->pa_inode_list,
    4758             :                                         &lg->lg_prealloc_list[order]);
    4759          52 :         spin_unlock(&lg->lg_prealloc_lock);
    4760             : 
    4761             :         /* Now trim the list to be not more than 8 elements */
    4762          52 :         if (lg_prealloc_count > 8) {
    4763           0 :                 ext4_mb_discard_lg_preallocations(sb, lg,
    4764             :                                                   order, lg_prealloc_count);
    4765           0 :                 return;
    4766             :         }
    4767             :         return ;
    4768             : }
    4769             : 
    4770             : /*
    4771             :  * if per-inode prealloc list is too long, trim some PA
    4772             :  */
    4773         245 : static void ext4_mb_trim_inode_pa(struct inode *inode)
    4774             : {
    4775         245 :         struct ext4_inode_info *ei = EXT4_I(inode);
    4776         245 :         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    4777         245 :         int count, delta;
    4778             : 
    4779         245 :         count = atomic_read(&ei->i_prealloc_active);
    4780         245 :         delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
    4781         245 :         if (count > sbi->s_mb_max_inode_prealloc + delta) {
    4782           0 :                 count -= sbi->s_mb_max_inode_prealloc;
    4783           0 :                 ext4_discard_preallocations(inode, count);
    4784             :         }
    4785         245 : }
    4786             : 
    4787             : /*
    4788             :  * release all resource we used in allocation
    4789             :  */
    4790         245 : static int ext4_mb_release_context(struct ext4_allocation_context *ac)
    4791             : {
    4792         245 :         struct inode *inode = ac->ac_inode;
    4793         245 :         struct ext4_inode_info *ei = EXT4_I(inode);
    4794         245 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4795         245 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    4796         245 :         if (pa) {
    4797          54 :                 if (pa->pa_type == MB_GROUP_PA) {
    4798             :                         /* see comment in ext4_mb_use_group_pa() */
    4799          52 :                         spin_lock(&pa->pa_lock);
    4800          52 :                         pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    4801          52 :                         pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    4802          52 :                         pa->pa_free -= ac->ac_b_ex.fe_len;
    4803          52 :                         pa->pa_len -= ac->ac_b_ex.fe_len;
    4804          52 :                         spin_unlock(&pa->pa_lock);
    4805             : 
    4806             :                         /*
    4807             :                          * We want to add the pa to the right bucket.
    4808             :                          * Remove it from the list and while adding
    4809             :                          * make sure the list to which we are adding
    4810             :                          * doesn't grow big.
    4811             :                          */
    4812          52 :                         if (likely(pa->pa_free)) {
    4813          52 :                                 spin_lock(pa->pa_obj_lock);
    4814          52 :                                 list_del_rcu(&pa->pa_inode_list);
    4815          52 :                                 spin_unlock(pa->pa_obj_lock);
    4816          52 :                                 ext4_mb_add_n_trim(ac);
    4817             :                         }
    4818             :                 }
    4819             : 
    4820          54 :                 if (pa->pa_type == MB_INODE_PA) {
    4821             :                         /*
    4822             :                          * treat per-inode prealloc list as a lru list, then try
    4823             :                          * to trim the least recently used PA.
    4824             :                          */
    4825           2 :                         spin_lock(pa->pa_obj_lock);
    4826           2 :                         list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
    4827           2 :                         spin_unlock(pa->pa_obj_lock);
    4828             :                 }
    4829             : 
    4830          54 :                 ext4_mb_put_pa(ac, ac->ac_sb, pa);
    4831             :         }
    4832         245 :         if (ac->ac_bitmap_page)
    4833         197 :                 put_page(ac->ac_bitmap_page);
    4834         245 :         if (ac->ac_buddy_page)
    4835         197 :                 put_page(ac->ac_buddy_page);
    4836         245 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
    4837          52 :                 mutex_unlock(&ac->ac_lg->lg_mutex);
    4838         245 :         ext4_mb_collect_stats(ac);
    4839         245 :         ext4_mb_trim_inode_pa(inode);
    4840         245 :         return 0;
    4841             : }
    4842             : 
    4843           0 : static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
    4844             : {
    4845           0 :         ext4_group_t i, ngroups = ext4_get_groups_count(sb);
    4846           0 :         int ret;
    4847           0 :         int freed = 0;
    4848             : 
    4849           0 :         trace_ext4_mb_discard_preallocations(sb, needed);
    4850           0 :         for (i = 0; i < ngroups && needed > 0; i++) {
    4851           0 :                 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
    4852           0 :                 freed += ret;
    4853           0 :                 needed -= ret;
    4854             :         }
    4855             : 
    4856           0 :         return freed;
    4857             : }
    4858             : 
    4859           0 : static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
    4860             :                         struct ext4_allocation_context *ac, u64 *seq)
    4861             : {
    4862           0 :         int freed;
    4863           0 :         u64 seq_retry = 0;
    4864           0 :         bool ret = false;
    4865             : 
    4866           0 :         freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
    4867           0 :         if (freed) {
    4868           0 :                 ret = true;
    4869           0 :                 goto out_dbg;
    4870             :         }
    4871           0 :         seq_retry = ext4_get_discard_pa_seq_sum();
    4872           0 :         if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
    4873           0 :                 ac->ac_flags |= EXT4_MB_STRICT_CHECK;
    4874           0 :                 *seq = seq_retry;
    4875           0 :                 ret = true;
    4876             :         }
    4877             : 
    4878           0 : out_dbg:
    4879           0 :         mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
    4880           0 :         return ret;
    4881             : }
    4882             : 
    4883             : static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
    4884             :                                 struct ext4_allocation_request *ar, int *errp);
    4885             : 
    4886             : /*
    4887             :  * Main entry point into mballoc to allocate blocks
    4888             :  * it tries to use preallocation first, then falls back
    4889             :  * to usual allocation
    4890             :  */
    4891         245 : ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
    4892             :                                 struct ext4_allocation_request *ar, int *errp)
    4893             : {
    4894         245 :         struct ext4_allocation_context *ac = NULL;
    4895         245 :         struct ext4_sb_info *sbi;
    4896         245 :         struct super_block *sb;
    4897         245 :         ext4_fsblk_t block = 0;
    4898         245 :         unsigned int inquota = 0;
    4899         245 :         unsigned int reserv_clstrs = 0;
    4900         245 :         u64 seq;
    4901             : 
    4902         245 :         might_sleep();
    4903         245 :         sb = ar->inode->i_sb;
    4904         245 :         sbi = EXT4_SB(sb);
    4905             : 
    4906         245 :         trace_ext4_request_blocks(ar);
    4907         245 :         if (sbi->s_mount_state & EXT4_FC_REPLAY)
    4908           0 :                 return ext4_mb_new_blocks_simple(handle, ar, errp);
    4909             : 
    4910             :         /* Allow to use superuser reservation for quota file */
    4911         245 :         if (ext4_is_quota_file(ar->inode))
    4912           0 :                 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
    4913             : 
    4914         245 :         if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
    4915             :                 /* Without delayed allocation we need to verify
    4916             :                  * there is enough free blocks to do block allocation
    4917             :                  * and verify allocation doesn't exceed the quota limits.
    4918             :                  */
    4919         346 :                 while (ar->len &&
    4920         173 :                         ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
    4921             : 
    4922             :                         /* let others to free the space */
    4923           0 :                         cond_resched();
    4924           0 :                         ar->len = ar->len >> 1;
    4925             :                 }
    4926         173 :                 if (!ar->len) {
    4927           0 :                         ext4_mb_show_pa(sb);
    4928           0 :                         *errp = -ENOSPC;
    4929           0 :                         return 0;
    4930             :                 }
    4931         173 :                 reserv_clstrs = ar->len;
    4932         173 :                 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
    4933         173 :                         dquot_alloc_block_nofail(ar->inode,
    4934           0 :                                                  EXT4_C2B(sbi, ar->len));
    4935             :                 } else {
    4936         346 :                         while (ar->len &&
    4937         173 :                                 dquot_alloc_block(ar->inode,
    4938         173 :                                                   EXT4_C2B(sbi, ar->len))) {
    4939             : 
    4940           0 :                                 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
    4941           0 :                                 ar->len--;
    4942             :                         }
    4943             :                 }
    4944         173 :                 inquota = ar->len;
    4945         173 :                 if (ar->len == 0) {
    4946           0 :                         *errp = -EDQUOT;
    4947           0 :                         goto out;
    4948             :                 }
    4949             :         }
    4950             : 
    4951         245 :         ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
    4952         245 :         if (!ac) {
    4953           0 :                 ar->len = 0;
    4954           0 :                 *errp = -ENOMEM;
    4955           0 :                 goto out;
    4956             :         }
    4957             : 
    4958         245 :         *errp = ext4_mb_initialize_context(ac, ar);
    4959         245 :         if (*errp) {
    4960           0 :                 ar->len = 0;
    4961           0 :                 goto out;
    4962             :         }
    4963             : 
    4964         245 :         ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
    4965         245 :         seq = this_cpu_read(discard_pa_seq);
    4966         245 :         if (!ext4_mb_use_preallocated(ac)) {
    4967         197 :                 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
    4968         197 :                 ext4_mb_normalize_request(ac, ar);
    4969             : 
    4970         197 :                 *errp = ext4_mb_pa_alloc(ac);
    4971         197 :                 if (*errp)
    4972           0 :                         goto errout;
    4973         197 : repeat:
    4974             :                 /* allocate space in core */
    4975         197 :                 *errp = ext4_mb_regular_allocator(ac);
    4976             :                 /*
    4977             :                  * pa allocated above is added to grp->bb_prealloc_list only
    4978             :                  * when we were able to allocate some block i.e. when
    4979             :                  * ac->ac_status == AC_STATUS_FOUND.
    4980             :                  * And error from above mean ac->ac_status != AC_STATUS_FOUND
    4981             :                  * So we have to free this pa here itself.
    4982             :                  */
    4983         197 :                 if (*errp) {
    4984           0 :                         ext4_mb_pa_free(ac);
    4985           0 :                         ext4_discard_allocated_blocks(ac);
    4986           0 :                         goto errout;
    4987             :                 }
    4988         197 :                 if (ac->ac_status == AC_STATUS_FOUND &&
    4989         197 :                         ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
    4990         191 :                         ext4_mb_pa_free(ac);
    4991             :         }
    4992         245 :         if (likely(ac->ac_status == AC_STATUS_FOUND)) {
    4993         245 :                 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
    4994         245 :                 if (*errp) {
    4995           0 :                         ext4_discard_allocated_blocks(ac);
    4996           0 :                         goto errout;
    4997             :                 } else {
    4998         245 :                         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    4999         245 :                         ar->len = ac->ac_b_ex.fe_len;
    5000             :                 }
    5001             :         } else {
    5002           0 :                 if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
    5003           0 :                         goto repeat;
    5004             :                 /*
    5005             :                  * If block allocation fails then the pa allocated above
    5006             :                  * needs to be freed here itself.
    5007             :                  */
    5008           0 :                 ext4_mb_pa_free(ac);
    5009           0 :                 *errp = -ENOSPC;
    5010             :         }
    5011             : 
    5012         245 : errout:
    5013         245 :         if (*errp) {
    5014           0 :                 ac->ac_b_ex.fe_len = 0;
    5015           0 :                 ar->len = 0;
    5016           0 :                 ext4_mb_show_ac(ac);
    5017             :         }
    5018         245 :         ext4_mb_release_context(ac);
    5019         245 : out:
    5020         245 :         if (ac)
    5021         245 :                 kmem_cache_free(ext4_ac_cachep, ac);
    5022         245 :         if (inquota && ar->len < inquota)
    5023           0 :                 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
    5024         245 :         if (!ar->len) {
    5025           0 :                 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
    5026             :                         /* release all the reserved blocks if non delalloc */
    5027           0 :                         percpu_counter_sub(&sbi->s_dirtyclusters_counter,
    5028             :                                                 reserv_clstrs);
    5029             :         }
    5030             : 
    5031         245 :         trace_ext4_allocate_blocks(ar, (unsigned long long)block);
    5032             : 
    5033         245 :         return block;
    5034             : }
    5035             : 
    5036             : /*
    5037             :  * We can merge two free data extents only if the physical blocks
    5038             :  * are contiguous, AND the extents were freed by the same transaction,
    5039             :  * AND the blocks are associated with the same group.
    5040             :  */
    5041         185 : static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
    5042             :                                         struct ext4_free_data *entry,
    5043             :                                         struct ext4_free_data *new_entry,
    5044             :                                         struct rb_root *entry_rb_root)
    5045             : {
    5046         185 :         if ((entry->efd_tid != new_entry->efd_tid) ||
    5047         185 :             (entry->efd_group != new_entry->efd_group))
    5048             :                 return;
    5049         185 :         if (entry->efd_start_cluster + entry->efd_count ==
    5050         185 :             new_entry->efd_start_cluster) {
    5051          30 :                 new_entry->efd_start_cluster = entry->efd_start_cluster;
    5052          30 :                 new_entry->efd_count += entry->efd_count;
    5053         155 :         } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
    5054             :                    entry->efd_start_cluster) {
    5055          12 :                 new_entry->efd_count += entry->efd_count;
    5056             :         } else
    5057             :                 return;
    5058          42 :         spin_lock(&sbi->s_md_lock);
    5059          42 :         list_del(&entry->efd_list);
    5060          42 :         spin_unlock(&sbi->s_md_lock);
    5061          42 :         rb_erase(&entry->efd_node, entry_rb_root);
    5062          42 :         kmem_cache_free(ext4_free_data_cachep, entry);
    5063             : }
    5064             : 
    5065             : static noinline_for_stack int
    5066         206 : ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
    5067             :                       struct ext4_free_data *new_entry)
    5068             : {
    5069         206 :         ext4_group_t group = e4b->bd_group;
    5070         206 :         ext4_grpblk_t cluster;
    5071         206 :         ext4_grpblk_t clusters = new_entry->efd_count;
    5072         206 :         struct ext4_free_data *entry;
    5073         206 :         struct ext4_group_info *db = e4b->bd_info;
    5074         206 :         struct super_block *sb = e4b->bd_sb;
    5075         206 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    5076         206 :         struct rb_node **n = &db->bb_free_root.rb_node, *node;
    5077         206 :         struct rb_node *parent = NULL, *new_node;
    5078             : 
    5079         206 :         BUG_ON(!ext4_handle_valid(handle));
    5080         206 :         BUG_ON(e4b->bd_bitmap_page == NULL);
    5081         206 :         BUG_ON(e4b->bd_buddy_page == NULL);
    5082             : 
    5083         206 :         new_node = &new_entry->efd_node;
    5084         206 :         cluster = new_entry->efd_start_cluster;
    5085             : 
    5086         206 :         if (!*n) {
    5087             :                 /* first free block exent. We need to
    5088             :                    protect buddy cache from being freed,
    5089             :                  * otherwise we'll refresh it from
    5090             :                  * on-disk bitmap and lose not-yet-available
    5091             :                  * blocks */
    5092          79 :                 get_page(e4b->bd_buddy_page);
    5093          79 :                 get_page(e4b->bd_bitmap_page);
    5094             :         }
    5095         503 :         while (*n) {
    5096         297 :                 parent = *n;
    5097         297 :                 entry = rb_entry(parent, struct ext4_free_data, efd_node);
    5098         297 :                 if (cluster < entry->efd_start_cluster)
    5099          84 :                         n = &(*n)->rb_left;
    5100         213 :                 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
    5101         213 :                         n = &(*n)->rb_right;
    5102             :                 else {
    5103           0 :                         ext4_grp_locked_error(sb, group, 0,
    5104             :                                 ext4_group_first_block_no(sb, group) +
    5105             :                                 EXT4_C2B(sbi, cluster),
    5106             :                                 "Block already on to-be-freed list");
    5107           0 :                         kmem_cache_free(ext4_free_data_cachep, new_entry);
    5108           0 :                         return 0;
    5109             :                 }
    5110             :         }
    5111             : 
    5112         206 :         rb_link_node(new_node, parent, n);
    5113         206 :         rb_insert_color(new_node, &db->bb_free_root);
    5114             : 
    5115             :         /* Now try to see the extent can be merged to left and right */
    5116         206 :         node = rb_prev(new_node);
    5117         206 :         if (node) {
    5118         108 :                 entry = rb_entry(node, struct ext4_free_data, efd_node);
    5119         108 :                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
    5120             :                                             &(db->bb_free_root));
    5121             :         }
    5122             : 
    5123         206 :         node = rb_next(new_node);
    5124         206 :         if (node) {
    5125          77 :                 entry = rb_entry(node, struct ext4_free_data, efd_node);
    5126          77 :                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
    5127             :                                             &(db->bb_free_root));
    5128             :         }
    5129             : 
    5130         206 :         spin_lock(&sbi->s_md_lock);
    5131         206 :         list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
    5132         206 :         sbi->s_mb_free_pending += clusters;
    5133         206 :         spin_unlock(&sbi->s_md_lock);
    5134         206 :         return 0;
    5135             : }
    5136             : 
    5137             : /*
    5138             :  * Simple allocator for Ext4 fast commit replay path. It searches for blocks
    5139             :  * linearly starting at the goal block and also excludes the blocks which
    5140             :  * are going to be in use after fast commit replay.
    5141             :  */
    5142           0 : static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
    5143             :                                 struct ext4_allocation_request *ar, int *errp)
    5144             : {
    5145           0 :         struct buffer_head *bitmap_bh;
    5146           0 :         struct super_block *sb = ar->inode->i_sb;
    5147           0 :         ext4_group_t group;
    5148           0 :         ext4_grpblk_t blkoff;
    5149           0 :         int i = sb->s_blocksize;
    5150           0 :         ext4_fsblk_t goal, block;
    5151           0 :         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
    5152             : 
    5153           0 :         goal = ar->goal;
    5154           0 :         if (goal < le32_to_cpu(es->s_first_data_block) ||
    5155           0 :                         goal >= ext4_blocks_count(es))
    5156             :                 goal = le32_to_cpu(es->s_first_data_block);
    5157             : 
    5158           0 :         ar->len = 0;
    5159           0 :         ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
    5160           0 :         for (; group < ext4_get_groups_count(sb); group++) {
    5161           0 :                 bitmap_bh = ext4_read_block_bitmap(sb, group);
    5162           0 :                 if (IS_ERR(bitmap_bh)) {
    5163           0 :                         *errp = PTR_ERR(bitmap_bh);
    5164           0 :                         pr_warn("Failed to read block bitmap\n");
    5165           0 :                         return 0;
    5166             :                 }
    5167             : 
    5168           0 :                 ext4_get_group_no_and_offset(sb,
    5169           0 :                         max(ext4_group_first_block_no(sb, group), goal),
    5170             :                         NULL, &blkoff);
    5171           0 :                 i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize,
    5172             :                                                 blkoff);
    5173           0 :                 brelse(bitmap_bh);
    5174           0 :                 if (i >= sb->s_blocksize)
    5175           0 :                         continue;
    5176           0 :                 if (ext4_fc_replay_check_excluded(sb,
    5177           0 :                         ext4_group_first_block_no(sb, group) + i))
    5178           0 :                         continue;
    5179             :                 break;
    5180             :         }
    5181             : 
    5182           0 :         if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize)
    5183             :                 return 0;
    5184             : 
    5185           0 :         block = ext4_group_first_block_no(sb, group) + i;
    5186           0 :         ext4_mb_mark_bb(sb, block, 1, 1);
    5187           0 :         ar->len = 1;
    5188             : 
    5189           0 :         return block;
    5190             : }
    5191             : 
    5192           0 : static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
    5193             :                                         unsigned long count)
    5194             : {
    5195           0 :         struct buffer_head *bitmap_bh;
    5196           0 :         struct super_block *sb = inode->i_sb;
    5197           0 :         struct ext4_group_desc *gdp;
    5198           0 :         struct buffer_head *gdp_bh;
    5199           0 :         ext4_group_t group;
    5200           0 :         ext4_grpblk_t blkoff;
    5201           0 :         int already_freed = 0, err, i;
    5202             : 
    5203           0 :         ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
    5204           0 :         bitmap_bh = ext4_read_block_bitmap(sb, group);
    5205           0 :         if (IS_ERR(bitmap_bh)) {
    5206           0 :                 err = PTR_ERR(bitmap_bh);
    5207           0 :                 pr_warn("Failed to read block bitmap\n");
    5208           0 :                 return;
    5209             :         }
    5210           0 :         gdp = ext4_get_group_desc(sb, group, &gdp_bh);
    5211           0 :         if (!gdp)
    5212             :                 return;
    5213             : 
    5214           0 :         for (i = 0; i < count; i++) {
    5215           0 :                 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
    5216           0 :                         already_freed++;
    5217             :         }
    5218           0 :         mb_clear_bits(bitmap_bh->b_data, blkoff, count);
    5219           0 :         err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
    5220           0 :         if (err)
    5221             :                 return;
    5222           0 :         ext4_free_group_clusters_set(
    5223           0 :                 sb, gdp, ext4_free_group_clusters(sb, gdp) +
    5224             :                 count - already_freed);
    5225           0 :         ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
    5226           0 :         ext4_group_desc_csum_set(sb, group, gdp);
    5227           0 :         ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
    5228           0 :         sync_dirty_buffer(bitmap_bh);
    5229           0 :         sync_dirty_buffer(gdp_bh);
    5230           0 :         brelse(bitmap_bh);
    5231             : }
    5232             : 
    5233             : /**
    5234             :  * ext4_free_blocks() -- Free given blocks and update quota
    5235             :  * @handle:             handle for this transaction
    5236             :  * @inode:              inode
    5237             :  * @bh:                 optional buffer of the block to be freed
    5238             :  * @block:              starting physical block to be freed
    5239             :  * @count:              number of blocks to be freed
    5240             :  * @flags:              flags used by ext4_free_blocks
    5241             :  */
    5242         206 : void ext4_free_blocks(handle_t *handle, struct inode *inode,
    5243             :                       struct buffer_head *bh, ext4_fsblk_t block,
    5244             :                       unsigned long count, int flags)
    5245             : {
    5246         206 :         struct buffer_head *bitmap_bh = NULL;
    5247         206 :         struct super_block *sb = inode->i_sb;
    5248         206 :         struct ext4_group_desc *gdp;
    5249         206 :         unsigned int overflow;
    5250         206 :         ext4_grpblk_t bit;
    5251         206 :         struct buffer_head *gd_bh;
    5252         206 :         ext4_group_t block_group;
    5253         206 :         struct ext4_sb_info *sbi;
    5254         206 :         struct ext4_buddy e4b;
    5255         206 :         unsigned int count_clusters;
    5256         206 :         int err = 0;
    5257         206 :         int ret;
    5258             : 
    5259         206 :         sbi = EXT4_SB(sb);
    5260             : 
    5261         206 :         if (sbi->s_mount_state & EXT4_FC_REPLAY) {
    5262           0 :                 ext4_free_blocks_simple(inode, block, count);
    5263           0 :                 return;
    5264             :         }
    5265             : 
    5266         206 :         might_sleep();
    5267         206 :         if (bh) {
    5268           0 :                 if (block)
    5269           0 :                         BUG_ON(block != bh->b_blocknr);
    5270             :                 else
    5271           0 :                         block = bh->b_blocknr;
    5272             :         }
    5273             : 
    5274         412 :         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
    5275         206 :             !ext4_inode_block_valid(inode, block, count)) {
    5276           0 :                 ext4_error(sb, "Freeing blocks not in datazone - "
    5277             :                            "block = %llu, count = %lu", block, count);
    5278           0 :                 goto error_return;
    5279             :         }
    5280             : 
    5281         206 :         ext4_debug("freeing block %llu\n", block);
    5282         206 :         trace_ext4_free_blocks(inode, block, count, flags);
    5283             : 
    5284         206 :         if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
    5285           0 :                 BUG_ON(count > 1);
    5286             : 
    5287           0 :                 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
    5288             :                             inode, bh, block);
    5289             :         }
    5290             : 
    5291             :         /*
    5292             :          * If the extent to be freed does not begin on a cluster
    5293             :          * boundary, we need to deal with partial clusters at the
    5294             :          * beginning and end of the extent.  Normally we will free
    5295             :          * blocks at the beginning or the end unless we are explicitly
    5296             :          * requested to avoid doing so.
    5297             :          */
    5298         206 :         overflow = EXT4_PBLK_COFF(sbi, block);
    5299         206 :         if (overflow) {
    5300           0 :                 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
    5301           0 :                         overflow = sbi->s_cluster_ratio - overflow;
    5302           0 :                         block += overflow;
    5303           0 :                         if (count > overflow)
    5304           0 :                                 count -= overflow;
    5305             :                         else
    5306             :                                 return;
    5307             :                 } else {
    5308           0 :                         block -= overflow;
    5309           0 :                         count += overflow;
    5310             :                 }
    5311             :         }
    5312         206 :         overflow = EXT4_LBLK_COFF(sbi, count);
    5313         206 :         if (overflow) {
    5314           0 :                 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
    5315           0 :                         if (count > overflow)
    5316           0 :                                 count -= overflow;
    5317             :                         else
    5318             :                                 return;
    5319             :                 } else
    5320           0 :                         count += sbi->s_cluster_ratio - overflow;
    5321             :         }
    5322             : 
    5323         206 :         if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
    5324          77 :                 int i;
    5325          77 :                 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
    5326             : 
    5327         154 :                 for (i = 0; i < count; i++) {
    5328          77 :                         cond_resched();
    5329          77 :                         if (is_metadata)
    5330          77 :                                 bh = sb_find_get_block(inode->i_sb, block + i);
    5331          77 :                         ext4_forget(handle, is_metadata, inode, bh, block + i);
    5332             :                 }
    5333             :         }
    5334             : 
    5335         206 : do_more:
    5336         206 :         overflow = 0;
    5337         206 :         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
    5338             : 
    5339         206 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
    5340             :                         ext4_get_group_info(sb, block_group))))
    5341             :                 return;
    5342             : 
    5343             :         /*
    5344             :          * Check to see if we are freeing blocks across a group
    5345             :          * boundary.
    5346             :          */
    5347         206 :         if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
    5348           0 :                 overflow = EXT4_C2B(sbi, bit) + count -
    5349           0 :                         EXT4_BLOCKS_PER_GROUP(sb);
    5350           0 :                 count -= overflow;
    5351             :         }
    5352         206 :         count_clusters = EXT4_NUM_B2C(sbi, count);
    5353         206 :         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
    5354         206 :         if (IS_ERR(bitmap_bh)) {
    5355           0 :                 err = PTR_ERR(bitmap_bh);
    5356           0 :                 bitmap_bh = NULL;
    5357           0 :                 goto error_return;
    5358             :         }
    5359         206 :         gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
    5360         206 :         if (!gdp) {
    5361           0 :                 err = -EIO;
    5362           0 :                 goto error_return;
    5363             :         }
    5364             : 
    5365         412 :         if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
    5366         412 :             in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
    5367         412 :             in_range(block, ext4_inode_table(sb, gdp),
    5368         206 :                      sbi->s_itb_per_group) ||
    5369         412 :             in_range(block + count - 1, ext4_inode_table(sb, gdp),
    5370             :                      sbi->s_itb_per_group)) {
    5371             : 
    5372           0 :                 ext4_error(sb, "Freeing blocks in system zone - "
    5373             :                            "Block = %llu, count = %lu", block, count);
    5374             :                 /* err = 0. ext4_std_error should be a no op */
    5375           0 :                 goto error_return;
    5376             :         }
    5377             : 
    5378         206 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    5379         206 :         err = ext4_journal_get_write_access(handle, bitmap_bh);
    5380         206 :         if (err)
    5381           0 :                 goto error_return;
    5382             : 
    5383             :         /*
    5384             :          * We are about to modify some metadata.  Call the journal APIs
    5385             :          * to unshare ->b_data if a currently-committing transaction is
    5386             :          * using it
    5387             :          */
    5388         206 :         BUFFER_TRACE(gd_bh, "get_write_access");
    5389         206 :         err = ext4_journal_get_write_access(handle, gd_bh);
    5390         206 :         if (err)
    5391           0 :                 goto error_return;
    5392             : #ifdef AGGRESSIVE_CHECK
    5393             :         {
    5394             :                 int i;
    5395             :                 for (i = 0; i < count_clusters; i++)
    5396             :                         BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
    5397             :         }
    5398             : #endif
    5399         206 :         trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
    5400             : 
    5401             :         /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
    5402         206 :         err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
    5403             :                                      GFP_NOFS|__GFP_NOFAIL);
    5404         206 :         if (err)
    5405           0 :                 goto error_return;
    5406             : 
    5407             :         /*
    5408             :          * We need to make sure we don't reuse the freed block until after the
    5409             :          * transaction is committed. We make an exception if the inode is to be
    5410             :          * written in writeback mode since writeback mode has weak data
    5411             :          * consistency guarantees.
    5412             :          */
    5413         206 :         if (ext4_handle_valid(handle) &&
    5414         206 :             ((flags & EXT4_FREE_BLOCKS_METADATA) ||
    5415         335 :              !ext4_should_writeback_data(inode))) {
    5416         206 :                 struct ext4_free_data *new_entry;
    5417             :                 /*
    5418             :                  * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
    5419             :                  * to fail.
    5420             :                  */
    5421         206 :                 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
    5422             :                                 GFP_NOFS|__GFP_NOFAIL);
    5423         206 :                 new_entry->efd_start_cluster = bit;
    5424         206 :                 new_entry->efd_group = block_group;
    5425         206 :                 new_entry->efd_count = count_clusters;
    5426         206 :                 new_entry->efd_tid = handle->h_transaction->t_tid;
    5427             : 
    5428         206 :                 ext4_lock_group(sb, block_group);
    5429         206 :                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
    5430         206 :                 ext4_mb_free_metadata(handle, &e4b, new_entry);
    5431             :         } else {
    5432             :                 /* need to update group_info->bb_free and bitmap
    5433             :                  * with group lock held. generate_buddy looks at
    5434             :                  * them with the group lock held
    5435             :                  */
    5436           0 :                 if (test_opt(sb, DISCARD)) {
    5437           0 :                         err = ext4_issue_discard(sb, block_group, bit, count,
    5438             :                                                  NULL);
    5439           0 :                         if (err && err != -EOPNOTSUPP)
    5440           0 :                                 ext4_msg(sb, KERN_WARNING, "discard request in"
    5441             :                                          " group:%d block:%d count:%lu failed"
    5442             :                                          " with %d", block_group, bit, count,
    5443             :                                          err);
    5444             :                 } else
    5445           0 :                         EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
    5446             : 
    5447           0 :                 ext4_lock_group(sb, block_group);
    5448           0 :                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
    5449           0 :                 mb_free_blocks(inode, &e4b, bit, count_clusters);
    5450             :         }
    5451             : 
    5452         206 :         ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
    5453         206 :         ext4_free_group_clusters_set(sb, gdp, ret);
    5454         206 :         ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
    5455         206 :         ext4_group_desc_csum_set(sb, block_group, gdp);
    5456         206 :         ext4_unlock_group(sb, block_group);
    5457             : 
    5458         206 :         if (sbi->s_log_groups_per_flex) {
    5459         206 :                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
    5460         618 :                 atomic64_add(count_clusters,
    5461         412 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    5462             :                                                   flex_group)->free_clusters);
    5463             :         }
    5464             : 
    5465             :         /*
    5466             :          * on a bigalloc file system, defer the s_freeclusters_counter
    5467             :          * update to the caller (ext4_remove_space and friends) so they
    5468             :          * can determine if a cluster freed here should be rereserved
    5469             :          */
    5470         206 :         if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
    5471         206 :                 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
    5472         206 :                         dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
    5473         206 :                 percpu_counter_add(&sbi->s_freeclusters_counter,
    5474             :                                    count_clusters);
    5475             :         }
    5476             : 
    5477         206 :         ext4_mb_unload_buddy(&e4b);
    5478             : 
    5479             :         /* We dirtied the bitmap block */
    5480         206 :         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
    5481         206 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    5482             : 
    5483             :         /* And the group descriptor block */
    5484         206 :         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
    5485         206 :         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
    5486         206 :         if (!err)
    5487         206 :                 err = ret;
    5488             : 
    5489         206 :         if (overflow && !err) {
    5490           0 :                 block += count;
    5491           0 :                 count = overflow;
    5492           0 :                 put_bh(bitmap_bh);
    5493           0 :                 goto do_more;
    5494             :         }
    5495         206 : error_return:
    5496         206 :         brelse(bitmap_bh);
    5497         206 :         ext4_std_error(sb, err);
    5498             :         return;
    5499             : }
    5500             : 
    5501             : /**
    5502             :  * ext4_group_add_blocks() -- Add given blocks to an existing group
    5503             :  * @handle:                     handle to this transaction
    5504             :  * @sb:                         super block
    5505             :  * @block:                      start physical block to add to the block group
    5506             :  * @count:                      number of blocks to free
    5507             :  *
    5508             :  * This marks the blocks as free in the bitmap and buddy.
    5509             :  */
    5510           0 : int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
    5511             :                          ext4_fsblk_t block, unsigned long count)
    5512             : {
    5513           0 :         struct buffer_head *bitmap_bh = NULL;
    5514           0 :         struct buffer_head *gd_bh;
    5515           0 :         ext4_group_t block_group;
    5516           0 :         ext4_grpblk_t bit;
    5517           0 :         unsigned int i;
    5518           0 :         struct ext4_group_desc *desc;
    5519           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    5520           0 :         struct ext4_buddy e4b;
    5521           0 :         int err = 0, ret, free_clusters_count;
    5522           0 :         ext4_grpblk_t clusters_freed;
    5523           0 :         ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
    5524           0 :         ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
    5525           0 :         unsigned long cluster_count = last_cluster - first_cluster + 1;
    5526             : 
    5527           0 :         ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
    5528             : 
    5529           0 :         if (count == 0)
    5530             :                 return 0;
    5531             : 
    5532           0 :         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
    5533             :         /*
    5534             :          * Check to see if we are freeing blocks across a group
    5535             :          * boundary.
    5536             :          */
    5537           0 :         if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
    5538           0 :                 ext4_warning(sb, "too many blocks added to group %u",
    5539             :                              block_group);
    5540           0 :                 err = -EINVAL;
    5541           0 :                 goto error_return;
    5542             :         }
    5543             : 
    5544           0 :         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
    5545           0 :         if (IS_ERR(bitmap_bh)) {
    5546           0 :                 err = PTR_ERR(bitmap_bh);
    5547           0 :                 bitmap_bh = NULL;
    5548           0 :                 goto error_return;
    5549             :         }
    5550             : 
    5551           0 :         desc = ext4_get_group_desc(sb, block_group, &gd_bh);
    5552           0 :         if (!desc) {
    5553           0 :                 err = -EIO;
    5554           0 :                 goto error_return;
    5555             :         }
    5556             : 
    5557           0 :         if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
    5558           0 :             in_range(ext4_inode_bitmap(sb, desc), block, count) ||
    5559           0 :             in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
    5560           0 :             in_range(block + count - 1, ext4_inode_table(sb, desc),
    5561             :                      sbi->s_itb_per_group)) {
    5562           0 :                 ext4_error(sb, "Adding blocks in system zones - "
    5563             :                            "Block = %llu, count = %lu",
    5564             :                            block, count);
    5565           0 :                 err = -EINVAL;
    5566           0 :                 goto error_return;
    5567             :         }
    5568             : 
    5569           0 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    5570           0 :         err = ext4_journal_get_write_access(handle, bitmap_bh);
    5571           0 :         if (err)
    5572           0 :                 goto error_return;
    5573             : 
    5574             :         /*
    5575             :          * We are about to modify some metadata.  Call the journal APIs
    5576             :          * to unshare ->b_data if a currently-committing transaction is
    5577             :          * using it
    5578             :          */
    5579           0 :         BUFFER_TRACE(gd_bh, "get_write_access");
    5580           0 :         err = ext4_journal_get_write_access(handle, gd_bh);
    5581           0 :         if (err)
    5582           0 :                 goto error_return;
    5583             : 
    5584           0 :         for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
    5585           0 :                 BUFFER_TRACE(bitmap_bh, "clear bit");
    5586           0 :                 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
    5587           0 :                         ext4_error(sb, "bit already cleared for block %llu",
    5588             :                                    (ext4_fsblk_t)(block + i));
    5589           0 :                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
    5590             :                 } else {
    5591           0 :                         clusters_freed++;
    5592             :                 }
    5593             :         }
    5594             : 
    5595           0 :         err = ext4_mb_load_buddy(sb, block_group, &e4b);
    5596           0 :         if (err)
    5597           0 :                 goto error_return;
    5598             : 
    5599             :         /*
    5600             :          * need to update group_info->bb_free and bitmap
    5601             :          * with group lock held. generate_buddy looks at
    5602             :          * them with the group lock held
    5603             :          */
    5604           0 :         ext4_lock_group(sb, block_group);
    5605           0 :         mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
    5606           0 :         mb_free_blocks(NULL, &e4b, bit, cluster_count);
    5607           0 :         free_clusters_count = clusters_freed +
    5608           0 :                 ext4_free_group_clusters(sb, desc);
    5609           0 :         ext4_free_group_clusters_set(sb, desc, free_clusters_count);
    5610           0 :         ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
    5611           0 :         ext4_group_desc_csum_set(sb, block_group, desc);
    5612           0 :         ext4_unlock_group(sb, block_group);
    5613           0 :         percpu_counter_add(&sbi->s_freeclusters_counter,
    5614             :                            clusters_freed);
    5615             : 
    5616           0 :         if (sbi->s_log_groups_per_flex) {
    5617           0 :                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
    5618           0 :                 atomic64_add(clusters_freed,
    5619           0 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    5620             :                                                   flex_group)->free_clusters);
    5621             :         }
    5622             : 
    5623           0 :         ext4_mb_unload_buddy(&e4b);
    5624             : 
    5625             :         /* We dirtied the bitmap block */
    5626           0 :         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
    5627           0 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    5628             : 
    5629             :         /* And the group descriptor block */
    5630           0 :         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
    5631           0 :         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
    5632           0 :         if (!err)
    5633           0 :                 err = ret;
    5634             : 
    5635           0 : error_return:
    5636           0 :         brelse(bitmap_bh);
    5637           0 :         ext4_std_error(sb, err);
    5638             :         return err;
    5639             : }
    5640             : 
    5641             : /**
    5642             :  * ext4_trim_extent -- function to TRIM one single free extent in the group
    5643             :  * @sb:         super block for the file system
    5644             :  * @start:      starting block of the free extent in the alloc. group
    5645             :  * @count:      number of blocks to TRIM
    5646             :  * @group:      alloc. group we are working with
    5647             :  * @e4b:        ext4 buddy for the group
    5648             :  *
    5649             :  * Trim "count" blocks starting at "start" in the "group". To assure that no
    5650             :  * one will allocate those blocks, mark it as used in buddy bitmap. This must
    5651             :  * be called with the group lock held.
    5652             :  */
    5653           0 : static int ext4_trim_extent(struct super_block *sb, int start, int count,
    5654             :                              ext4_group_t group, struct ext4_buddy *e4b)
    5655             : __releases(bitlock)
    5656             : __acquires(bitlock)
    5657             : {
    5658           0 :         struct ext4_free_extent ex;
    5659           0 :         int ret = 0;
    5660             : 
    5661           0 :         trace_ext4_trim_extent(sb, group, start, count);
    5662             : 
    5663           0 :         assert_spin_locked(ext4_group_lock_ptr(sb, group));
    5664             : 
    5665           0 :         ex.fe_start = start;
    5666           0 :         ex.fe_group = group;
    5667           0 :         ex.fe_len = count;
    5668             : 
    5669             :         /*
    5670             :          * Mark blocks used, so no one can reuse them while
    5671             :          * being trimmed.
    5672             :          */
    5673           0 :         mb_mark_used(e4b, &ex);
    5674           0 :         ext4_unlock_group(sb, group);
    5675           0 :         ret = ext4_issue_discard(sb, group, start, count, NULL);
    5676           0 :         ext4_lock_group(sb, group);
    5677           0 :         mb_free_blocks(NULL, e4b, start, ex.fe_len);
    5678           0 :         return ret;
    5679             : }
    5680             : 
    5681             : /**
    5682             :  * ext4_trim_all_free -- function to trim all free space in alloc. group
    5683             :  * @sb:                 super block for file system
    5684             :  * @group:              group to be trimmed
    5685             :  * @start:              first group block to examine
    5686             :  * @max:                last group block to examine
    5687             :  * @minblocks:          minimum extent block count
    5688             :  *
    5689             :  * ext4_trim_all_free walks through group's buddy bitmap searching for free
    5690             :  * extents. When the free block is found, ext4_trim_extent is called to TRIM
    5691             :  * the extent.
    5692             :  *
    5693             :  *
    5694             :  * ext4_trim_all_free walks through group's block bitmap searching for free
    5695             :  * extents. When the free extent is found, mark it as used in group buddy
    5696             :  * bitmap. Then issue a TRIM command on this extent and free the extent in
    5697             :  * the group buddy bitmap. This is done until whole group is scanned.
    5698             :  */
    5699             : static ext4_grpblk_t
    5700           0 : ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
    5701             :                    ext4_grpblk_t start, ext4_grpblk_t max,
    5702             :                    ext4_grpblk_t minblocks)
    5703             : {
    5704           0 :         void *bitmap;
    5705           0 :         ext4_grpblk_t next, count = 0, free_count = 0;
    5706           0 :         struct ext4_buddy e4b;
    5707           0 :         int ret = 0;
    5708             : 
    5709           0 :         trace_ext4_trim_all_free(sb, group, start, max);
    5710             : 
    5711           0 :         ret = ext4_mb_load_buddy(sb, group, &e4b);
    5712           0 :         if (ret) {
    5713           0 :                 ext4_warning(sb, "Error %d loading buddy information for %u",
    5714             :                              ret, group);
    5715           0 :                 return ret;
    5716             :         }
    5717           0 :         bitmap = e4b.bd_bitmap;
    5718             : 
    5719           0 :         ext4_lock_group(sb, group);
    5720           0 :         if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
    5721           0 :             minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
    5722           0 :                 goto out;
    5723             : 
    5724           0 :         start = (e4b.bd_info->bb_first_free > start) ?
    5725             :                 e4b.bd_info->bb_first_free : start;
    5726             : 
    5727           0 :         while (start <= max) {
    5728           0 :                 start = mb_find_next_zero_bit(bitmap, max + 1, start);
    5729           0 :                 if (start > max)
    5730             :                         break;
    5731           0 :                 next = mb_find_next_bit(bitmap, max + 1, start);
    5732             : 
    5733           0 :                 if ((next - start) >= minblocks) {
    5734           0 :                         ret = ext4_trim_extent(sb, start,
    5735             :                                                next - start, group, &e4b);
    5736           0 :                         if (ret && ret != -EOPNOTSUPP)
    5737             :                                 break;
    5738           0 :                         ret = 0;
    5739           0 :                         count += next - start;
    5740             :                 }
    5741           0 :                 free_count += next - start;
    5742           0 :                 start = next + 1;
    5743             : 
    5744           0 :                 if (fatal_signal_pending(current)) {
    5745             :                         count = -ERESTARTSYS;
    5746             :                         break;
    5747             :                 }
    5748             : 
    5749           0 :                 if (need_resched()) {
    5750           0 :                         ext4_unlock_group(sb, group);
    5751           0 :                         cond_resched();
    5752           0 :                         ext4_lock_group(sb, group);
    5753             :                 }
    5754             : 
    5755           0 :                 if ((e4b.bd_info->bb_free - free_count) < minblocks)
    5756             :                         break;
    5757             :         }
    5758             : 
    5759           0 :         if (!ret) {
    5760           0 :                 ret = count;
    5761           0 :                 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
    5762             :         }
    5763           0 : out:
    5764           0 :         ext4_unlock_group(sb, group);
    5765           0 :         ext4_mb_unload_buddy(&e4b);
    5766             : 
    5767           0 :         ext4_debug("trimmed %d blocks in the group %d\n",
    5768             :                 count, group);
    5769             : 
    5770           0 :         return ret;
    5771             : }
    5772             : 
    5773             : /**
    5774             :  * ext4_trim_fs() -- trim ioctl handle function
    5775             :  * @sb:                 superblock for filesystem
    5776             :  * @range:              fstrim_range structure
    5777             :  *
    5778             :  * start:       First Byte to trim
    5779             :  * len:         number of Bytes to trim from start
    5780             :  * minlen:      minimum extent length in Bytes
    5781             :  * ext4_trim_fs goes through all allocation groups containing Bytes from
    5782             :  * start to start+len. For each such group, ext4_trim_all_free() is
    5783             :  * invoked to trim all free space.
    5784             :  */
    5785           0 : int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
    5786             : {
    5787           0 :         struct ext4_group_info *grp;
    5788           0 :         ext4_group_t group, first_group, last_group;
    5789           0 :         ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
    5790           0 :         uint64_t start, end, minlen, trimmed = 0;
    5791           0 :         ext4_fsblk_t first_data_blk =
    5792           0 :                         le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
    5793           0 :         ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
    5794           0 :         int ret = 0;
    5795             : 
    5796           0 :         start = range->start >> sb->s_blocksize_bits;
    5797           0 :         end = start + (range->len >> sb->s_blocksize_bits) - 1;
    5798           0 :         minlen = EXT4_NUM_B2C(EXT4_SB(sb),
    5799             :                               range->minlen >> sb->s_blocksize_bits);
    5800             : 
    5801           0 :         if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
    5802           0 :             start >= max_blks ||
    5803           0 :             range->len < sb->s_blocksize)
    5804             :                 return -EINVAL;
    5805           0 :         if (end >= max_blks)
    5806           0 :                 end = max_blks - 1;
    5807           0 :         if (end <= first_data_blk)
    5808           0 :                 goto out;
    5809           0 :         if (start < first_data_blk)
    5810             :                 start = first_data_blk;
    5811             : 
    5812             :         /* Determine first and last group to examine based on start and end */
    5813           0 :         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
    5814             :                                      &first_group, &first_cluster);
    5815           0 :         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
    5816             :                                      &last_group, &last_cluster);
    5817             : 
    5818             :         /* end now represents the last cluster to discard in this group */
    5819           0 :         end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
    5820             : 
    5821           0 :         for (group = first_group; group <= last_group; group++) {
    5822           0 :                 grp = ext4_get_group_info(sb, group);
    5823             :                 /* We only do this if the grp has never been initialized */
    5824           0 :                 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    5825           0 :                         ret = ext4_mb_init_group(sb, group, GFP_NOFS);
    5826           0 :                         if (ret)
    5827             :                                 break;
    5828             :                 }
    5829             : 
    5830             :                 /*
    5831             :                  * For all the groups except the last one, last cluster will
    5832             :                  * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
    5833             :                  * change it for the last group, note that last_cluster is
    5834             :                  * already computed earlier by ext4_get_group_no_and_offset()
    5835             :                  */
    5836           0 :                 if (group == last_group)
    5837           0 :                         end = last_cluster;
    5838             : 
    5839           0 :                 if (grp->bb_free >= minlen) {
    5840           0 :                         cnt = ext4_trim_all_free(sb, group, first_cluster,
    5841             :                                                 end, minlen);
    5842           0 :                         if (cnt < 0) {
    5843             :                                 ret = cnt;
    5844             :                                 break;
    5845             :                         }
    5846           0 :                         trimmed += cnt;
    5847             :                 }
    5848             : 
    5849             :                 /*
    5850             :                  * For every group except the first one, we are sure
    5851             :                  * that the first cluster to discard will be cluster #0.
    5852             :                  */
    5853           0 :                 first_cluster = 0;
    5854             :         }
    5855             : 
    5856           0 :         if (!ret)
    5857           0 :                 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
    5858             : 
    5859           0 : out:
    5860           0 :         range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
    5861           0 :         return ret;
    5862             : }
    5863             : 
    5864             : /* Iterate all the free extents in the group. */
    5865             : int
    5866           0 : ext4_mballoc_query_range(
    5867             :         struct super_block              *sb,
    5868             :         ext4_group_t                    group,
    5869             :         ext4_grpblk_t                   start,
    5870             :         ext4_grpblk_t                   end,
    5871             :         ext4_mballoc_query_range_fn     formatter,
    5872             :         void                            *priv)
    5873             : {
    5874           0 :         void                            *bitmap;
    5875           0 :         ext4_grpblk_t                   next;
    5876           0 :         struct ext4_buddy               e4b;
    5877           0 :         int                             error;
    5878             : 
    5879           0 :         error = ext4_mb_load_buddy(sb, group, &e4b);
    5880           0 :         if (error)
    5881             :                 return error;
    5882           0 :         bitmap = e4b.bd_bitmap;
    5883             : 
    5884           0 :         ext4_lock_group(sb, group);
    5885             : 
    5886           0 :         start = (e4b.bd_info->bb_first_free > start) ?
    5887             :                 e4b.bd_info->bb_first_free : start;
    5888           0 :         if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
    5889           0 :                 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
    5890             : 
    5891           0 :         while (start <= end) {
    5892           0 :                 start = mb_find_next_zero_bit(bitmap, end + 1, start);
    5893           0 :                 if (start > end)
    5894             :                         break;
    5895           0 :                 next = mb_find_next_bit(bitmap, end + 1, start);
    5896             : 
    5897           0 :                 ext4_unlock_group(sb, group);
    5898           0 :                 error = formatter(sb, group, start, next - start, priv);
    5899           0 :                 if (error)
    5900           0 :                         goto out_unload;
    5901           0 :                 ext4_lock_group(sb, group);
    5902             : 
    5903           0 :                 start = next + 1;
    5904             :         }
    5905             : 
    5906           0 :         ext4_unlock_group(sb, group);
    5907           0 : out_unload:
    5908           0 :         ext4_mb_unload_buddy(&e4b);
    5909             : 
    5910           0 :         return error;
    5911             : }

Generated by: LCOV version 1.14