Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * fs/ext4/extents_status.c
4 : *
5 : * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
6 : * Modified by
7 : * Allison Henderson <achender@linux.vnet.ibm.com>
8 : * Hugh Dickins <hughd@google.com>
9 : * Zheng Liu <wenqing.lz@taobao.com>
10 : *
11 : * Ext4 extents status tree core functions.
12 : */
13 : #include <linux/list_sort.h>
14 : #include <linux/proc_fs.h>
15 : #include <linux/seq_file.h>
16 : #include "ext4.h"
17 :
18 : #include <trace/events/ext4.h>
19 :
20 : /*
21 : * According to previous discussion at the Ext4 Developer Workshop, we
22 : * will introduce a new structure called the io tree to track all extent
23 : * status in order to solve some problems that we have met
24 : * (e.g. reservation space warnings), and to provide extent-level locking.
25 : * The delay extent tree is the first step toward this goal. It was
26 : * originally built by Yongqiang Yang. At that time it was called the
27 : * delay extent tree, whose only goal was to track delayed extents in
28 : * memory to simplify the implementation of fiemap and bigalloc, and to
29 : * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it was still
30 : * called the delay extent tree in the first commit. But to better
31 : * describe what it does, it has been renamed to the extent status tree.
32 : *
33 : * Step1:
34 : * Currently the first step has been done. All delayed extents are
35 : * tracked in the tree. The tree maintains a delayed extent from the
36 : * time a delayed allocation is issued until the delayed extent is
37 : * written out or invalidated. Therefore the implementations of fiemap
38 : * and bigalloc are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
39 : *
40 : * The following comment describes the implementation of the extent
41 : * status tree and future work.
42 : *
43 : * Step2:
44 : * In this step all extent status is tracked by the extent status tree.
45 : * Thus, we can first try to look up a block mapping in this tree before
46 : * searching the extent tree, and the single extent cache can be removed
47 : * because the extent status tree does a better job. Extents in the
48 : * status tree are loaded on demand, so the tree may not contain all of
49 : * the extents in a file. Meanwhile we define a shrinker to reclaim
50 : * memory from the tree, since a fragmented extent tree would make the
51 : * status tree cost too much memory. Written/unwritten/hole extents in
52 : * the tree will be reclaimed by this shrinker when we are under high
53 : * memory pressure. Delayed extents will not be reclaimed because
54 : * fiemap, bigalloc, and seek_data/hole need them.
55 : */
56 :
57 : /*
58 : * Extent status tree implementation for ext4.
59 : *
60 : *
61 : * ==========================================================================
62 : * Extent status tree tracks all extent status.
63 : *
64 : * 1. Why we need to implement extent status tree?
65 : *
66 : * Without an extent status tree, ext4 identifies a delayed extent by
67 : * looking up the page cache, which has several deficiencies: the code
68 : * is complicated, buggy, and inefficient.
69 : *
70 : * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
71 : * block or a range of blocks belongs to a delayed extent.
72 : *
73 : * Let us have a look at how they work without an extent status tree.
74 : * -- FIEMAP
75 : * FIEMAP looks up the page cache to distinguish delayed allocations from holes.
76 : *
77 : * -- SEEK_HOLE/DATA
78 : * SEEK_HOLE/DATA has the same problem as FIEMAP.
79 : *
80 : * -- bigalloc
81 : * bigalloc looks up the page cache to figure out whether a block is
82 : * already under delayed allocation, in order to determine whether a
83 : * quota reservation is needed for the cluster.
84 : *
85 : * -- writeout
86 : * Writeout looks up the whole page cache to see if a buffer is
87 : * mapped. If there are not very many delayed buffers, this is
88 : * time consuming.
89 : *
90 : * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
91 : * bigalloc, and writeout can figure out if a block or a range of
92 : * blocks is under delayed allocation (belongs to a delayed extent)
93 : * by searching the extent status tree.
94 : *
95 : *
96 : * ==========================================================================
97 : * 2. Ext4 extent status tree implementation
98 : *
99 : * -- extent
100 : * An extent is a range of blocks which are contiguous logically and
101 : * physically. Unlike an extent in the extent tree, this extent in ext4
102 : * is an in-memory struct with no corresponding on-disk data. There
103 : * is no limit on the length of an extent, so an extent can contain as
104 : * many blocks as are contiguous logically and physically.
105 : *
106 : * -- extent status tree
107 : * Every inode has an extent status tree and all allocated blocks
108 : * are added to the tree with their status. The extents in the
109 : * tree are ordered by logical block number.
110 : *
111 : * -- operations on an extent status tree
112 : * There are three important operations on an extent status tree: finding
113 : * the next extent, adding an extent (a range of blocks), and removing one.
114 : *
115 : * -- races on an extent status tree
116 : * The extent status tree is protected by inode->i_es_lock.
117 : *
118 : * -- memory consumption
119 : * A fragmented extent tree will make the extent status tree cost too
120 : * much memory. Hence, we reclaim written/unwritten/hole extents from
121 : * the tree under heavy memory pressure.
122 : *
123 : *
124 : * ==========================================================================
125 : * 3. Performance analysis
126 : *
127 : * -- overhead
128 : * 1. There is a cached extent for write access, so if writes are
129 : * not very random, adding space operations take O(1) time.
130 : *
131 : * -- gain
132 : * 1. Code is much simpler, more readable, more maintainable and
133 : * more efficient.
134 : *
135 : *
136 : * ==========================================================================
137 : * 4. TODO list
138 : *
139 : * -- Refactor delayed space reservation
140 : *
141 : * -- Extent-level locking
142 : */
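/*
 * A minimal usage sketch of the find/add/remove operations named above,
 * using this file's own API; the inode and block numbers are
 * hypothetical, chosen only for illustration:
 *
 *	struct extent_status es;
 *
 *	(add) record blocks [0, 16) as written at physical block 1024:
 *	ext4_es_insert_extent(inode, 0, 16, 1024, EXTENT_STATUS_WRITTEN);
 *
 *	(find) look up the status of logical block 8, returns 1 if found:
 *	if (ext4_es_lookup_extent(inode, 8, NULL, &es))
 *		pr_info("[%u/%u)\n", es.es_lblk, es.es_len);
 *
 *	(remove) drop blocks [0, 16) from the tree again:
 *	ext4_es_remove_extent(inode, 0, 16);
 */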
143 :
144 : static struct kmem_cache *ext4_es_cachep;
145 : static struct kmem_cache *ext4_pending_cachep;
146 :
147 : static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
148 : static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149 : ext4_lblk_t end, int *reserved);
150 : static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
151 : static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152 : struct ext4_inode_info *locked_ei);
153 : static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
154 : ext4_lblk_t len);
155 :
156 1 : int __init ext4_init_es(void)
157 : {
158 1 : ext4_es_cachep = kmem_cache_create("ext4_extent_status",
159 : sizeof(struct extent_status),
160 : 0, (SLAB_RECLAIM_ACCOUNT), NULL);
161 1 : if (ext4_es_cachep == NULL)
162 0 : return -ENOMEM;
163 : return 0;
164 : }
165 :
166 0 : void ext4_exit_es(void)
167 : {
168 0 : kmem_cache_destroy(ext4_es_cachep);
169 0 : }
170 :
171 5559 : void ext4_es_init_tree(struct ext4_es_tree *tree)
172 : {
173 5559 : tree->root = RB_ROOT;
174 5559 : tree->cache_es = NULL;
175 5559 : }
176 :
177 : #ifdef ES_DEBUG__
178 : static void ext4_es_print_tree(struct inode *inode)
179 : {
180 : struct ext4_es_tree *tree;
181 : struct rb_node *node;
182 :
183 : printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
184 : tree = &EXT4_I(inode)->i_es_tree;
185 : node = rb_first(&tree->root);
186 : while (node) {
187 : struct extent_status *es;
188 : es = rb_entry(node, struct extent_status, rb_node);
189 : printk(KERN_DEBUG " [%u/%u) %llu %x",
190 : es->es_lblk, es->es_len,
191 : ext4_es_pblock(es), ext4_es_status(es));
192 : node = rb_next(node);
193 : }
194 : printk(KERN_DEBUG "\n");
195 : }
196 : #else
197 : #define ext4_es_print_tree(inode)
198 : #endif
199 :
200 23206 : static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
201 : {
202 23206 : BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
203 23206 : return es->es_lblk + es->es_len - 1;
204 : }
205 :
206 : /*
207 : * search through the tree for an extent containing a given offset. If
208 : * it can't be found, try to find the next extent.
209 : */
210 6426 : static struct extent_status *__es_tree_search(struct rb_root *root,
211 : ext4_lblk_t lblk)
212 : {
213 6426 : struct rb_node *node = root->rb_node;
214 6426 : struct extent_status *es = NULL;
215 :
216 9360 : while (node) {
217 6423 : es = rb_entry(node, struct extent_status, rb_node);
218 6423 : if (lblk < es->es_lblk)
219 369 : node = node->rb_left;
220 6054 : else if (lblk > ext4_es_end(es))
221 2565 : node = node->rb_right;
222 : else
223 3489 : return es;
224 : }
225 :
226 2937 : if (es && lblk < es->es_lblk)
227 : return es;
228 :
229 2937 : if (es && lblk > ext4_es_end(es)) {
230 489 : node = rb_next(&es->rb_node);
231 978 : return node ? rb_entry(node, struct extent_status, rb_node) :
232 : NULL;
233 : }
234 :
235 : return NULL;
236 : }
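/*
 * Behavior sketch for __es_tree_search(), with hypothetical extents
 * [0/10) and [20/5) in the tree: a search for lblk 5 returns [0/10)
 * because that extent contains the block; a search for lblk 12 falls
 * in the gap and returns the next extent, [20/5); a search for lblk 30
 * lies beyond the last extent and returns NULL.
 */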
237 :
238 : /*
239 : * __es_find_extent_range - find extent with specified status within block
240 : * range or next extent following block range in
241 : * extents status tree
242 : *
243 : * @inode - file containing the range
244 : * @matching_fn - pointer to function that matches extents with desired status
245 : * @lblk - logical block defining start of range
246 : * @end - logical block defining end of range
247 : * @es - extent found, if any
248 : *
249 : * Find the first extent within the block range specified by @lblk and @end
250 : * in the extents status tree that satisfies @matching_fn. If a match
251 : * is found, it's returned in @es. If not, and a matching extent is found
252 : * beyond the block range, it's returned in @es. If no match is found, an
253 : * extent is returned in @es whose es_lblk, es_len, and es_pblk components
254 : * are 0.
255 : */
256 623 : static void __es_find_extent_range(struct inode *inode,
257 : int (*matching_fn)(struct extent_status *es),
258 : ext4_lblk_t lblk, ext4_lblk_t end,
259 : struct extent_status *es)
260 : {
261 623 : struct ext4_es_tree *tree = NULL;
262 623 : struct extent_status *es1 = NULL;
263 623 : struct rb_node *node;
264 :
265 623 : WARN_ON(es == NULL);
266 623 : WARN_ON(end < lblk);
267 :
268 623 : tree = &EXT4_I(inode)->i_es_tree;
269 :
270 : /* see if the extent has been cached */
271 623 : es->es_lblk = es->es_len = es->es_pblk = 0;
272 623 : if (tree->cache_es) {
273 6 : es1 = tree->cache_es;
274 6 : if (in_range(lblk, es1->es_lblk, es1->es_len)) {
275 0 : es_debug("%u cached by [%u/%u) %llu %x\n",
276 : lblk, es1->es_lblk, es1->es_len,
277 : ext4_es_pblock(es1), ext4_es_status(es1));
278 0 : goto out;
279 : }
280 : }
281 :
282 623 : es1 = __es_tree_search(&tree->root, lblk);
283 :
284 623 : out:
285 623 : if (es1 && !matching_fn(es1)) {
286 0 : while ((node = rb_next(&es1->rb_node)) != NULL) {
287 0 : es1 = rb_entry(node, struct extent_status, rb_node);
288 0 : if (es1->es_lblk > end) {
289 : es1 = NULL;
290 : break;
291 : }
292 0 : if (matching_fn(es1))
293 : break;
294 : }
295 : }
296 :
297 623 : if (es1 && matching_fn(es1)) {
298 0 : tree->cache_es = es1;
299 0 : es->es_lblk = es1->es_lblk;
300 0 : es->es_len = es1->es_len;
301 0 : es->es_pblk = es1->es_pblk;
302 : }
303 :
304 623 : }
305 :
306 : /*
307 : * Locking for __es_find_extent_range() for external use
308 : */
309 623 : void ext4_es_find_extent_range(struct inode *inode,
310 : int (*matching_fn)(struct extent_status *es),
311 : ext4_lblk_t lblk, ext4_lblk_t end,
312 : struct extent_status *es)
313 : {
314 623 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
315 : return;
316 :
317 623 : trace_ext4_es_find_extent_range_enter(inode, lblk);
318 :
319 623 : read_lock(&EXT4_I(inode)->i_es_lock);
320 623 : __es_find_extent_range(inode, matching_fn, lblk, end, es);
321 623 : read_unlock(&EXT4_I(inode)->i_es_lock);
322 :
323 623 : trace_ext4_es_find_extent_range_exit(inode, es);
324 : }
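/*
 * A hypothetical caller sketch: find the first delayed extent within
 * blocks [start, end], using ext4_es_is_delayed() from extents_status.h
 * as the matching function; es.es_len == 0 afterwards means no matching
 * extent was found at or after @start:
 *
 *	struct extent_status es;
 *
 *	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
 *				  start, end, &es);
 *	if (es.es_len != 0 && es.es_lblk <= end)
 *		(a delayed extent begins at es.es_lblk)
 */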
325 :
326 : /*
327 : * __es_scan_range - search block range for block with specified status
328 : * in extents status tree
329 : *
330 : * @inode - file containing the range
331 : * @matching_fn - pointer to function that matches extents with desired status
332 : * @lblk - logical block defining start of range
333 : * @end - logical block defining end of range
334 : *
335 : * Returns true if at least one block in the specified block range satisfies
336 : * the criterion specified by @matching_fn, and false if not. If at least
337 : * one extent has the specified status, then there is at least one block
338 : * in the cluster with that status. Should only be called by code that has
339 : * taken i_es_lock.
340 : */
341 0 : static bool __es_scan_range(struct inode *inode,
342 : int (*matching_fn)(struct extent_status *es),
343 : ext4_lblk_t start, ext4_lblk_t end)
344 : {
345 0 : struct extent_status es;
346 :
347 0 : __es_find_extent_range(inode, matching_fn, start, end, &es);
348 0 : if (es.es_len == 0)
349 : return false; /* no matching extent in the tree */
350 0 : else if (es.es_lblk <= start &&
351 0 : start < es.es_lblk + es.es_len)
352 : return true;
353 0 : else if (start <= es.es_lblk && es.es_lblk <= end)
354 : return true;
355 : else
356 0 : return false;
357 : }
358 : /*
359 : * Locking for __es_scan_range() for external use
360 : */
361 0 : bool ext4_es_scan_range(struct inode *inode,
362 : int (*matching_fn)(struct extent_status *es),
363 : ext4_lblk_t lblk, ext4_lblk_t end)
364 : {
365 0 : bool ret;
366 :
367 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
368 : return false;
369 :
370 0 : read_lock(&EXT4_I(inode)->i_es_lock);
371 0 : ret = __es_scan_range(inode, matching_fn, lblk, end);
372 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
373 :
374 0 : return ret;
375 : }
376 :
377 : /*
378 : * __es_scan_clu - search cluster for block with specified status in
379 : * extents status tree
380 : *
381 : * @inode - file containing the cluster
382 : * @matching_fn - pointer to function that matches extents with desired status
383 : * @lblk - logical block in cluster to be searched
384 : *
385 : * Returns true if at least one extent in the cluster containing @lblk
386 : * satisfies the criterion specified by @matching_fn, and false if not. If at
387 : * least one extent has the specified status, then there is at least one block
388 : * in the cluster with that status. Should only be called by code that has
389 : * taken i_es_lock.
390 : */
391 0 : static bool __es_scan_clu(struct inode *inode,
392 : int (*matching_fn)(struct extent_status *es),
393 : ext4_lblk_t lblk)
394 : {
395 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
396 0 : ext4_lblk_t lblk_start, lblk_end;
397 :
398 0 : lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
399 0 : lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
400 :
401 0 : return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
402 : }
403 :
404 : /*
405 : * Locking for __es_scan_clu() for external use
406 : */
407 0 : bool ext4_es_scan_clu(struct inode *inode,
408 : int (*matching_fn)(struct extent_status *es),
409 : ext4_lblk_t lblk)
410 : {
411 0 : bool ret;
412 :
413 0 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
414 : return false;
415 :
416 0 : read_lock(&EXT4_I(inode)->i_es_lock);
417 0 : ret = __es_scan_clu(inode, matching_fn, lblk);
418 0 : read_unlock(&EXT4_I(inode)->i_es_lock);
419 :
420 0 : return ret;
421 : }
422 :
423 2452 : static void ext4_es_list_add(struct inode *inode)
424 : {
425 2452 : struct ext4_inode_info *ei = EXT4_I(inode);
426 2452 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
427 :
428 2452 : if (!list_empty(&ei->i_es_list))
429 : return;
430 :
431 2452 : spin_lock(&sbi->s_es_lock);
432 2452 : if (list_empty(&ei->i_es_list)) {
433 2452 : list_add_tail(&ei->i_es_list, &sbi->s_es_list);
434 2452 : sbi->s_es_nr_inode++;
435 : }
436 2452 : spin_unlock(&sbi->s_es_lock);
437 : }
438 :
439 953 : static void ext4_es_list_del(struct inode *inode)
440 : {
441 953 : struct ext4_inode_info *ei = EXT4_I(inode);
442 953 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
443 :
444 953 : spin_lock(&sbi->s_es_lock);
445 953 : if (!list_empty(&ei->i_es_list)) {
446 953 : list_del_init(&ei->i_es_list);
447 953 : sbi->s_es_nr_inode--;
448 953 : WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
449 : }
450 953 : spin_unlock(&sbi->s_es_lock);
451 953 : }
452 :
453 : static struct extent_status *
454 3733 : ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
455 : ext4_fsblk_t pblk)
456 : {
457 3733 : struct extent_status *es;
458 3733 : es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
459 3733 : if (es == NULL)
460 : return NULL;
461 3733 : es->es_lblk = lblk;
462 3733 : es->es_len = len;
463 3733 : es->es_pblk = pblk;
464 :
465 : /*
466 : * We don't count delayed extent because we never try to reclaim them
467 : */
468 3733 : if (!ext4_es_is_delayed(es)) {
469 3273 : if (!EXT4_I(inode)->i_es_shk_nr++)
470 2452 : ext4_es_list_add(inode);
471 3273 : percpu_counter_inc(&EXT4_SB(inode->i_sb)->
472 : s_es_stats.es_stats_shk_cnt);
473 : }
474 :
475 3733 : EXT4_I(inode)->i_es_all_nr++;
476 3733 : percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
477 :
478 3733 : return es;
479 : }
480 :
481 1275 : static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
482 : {
483 1275 : EXT4_I(inode)->i_es_all_nr--;
484 1275 : percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
485 :
486 : /* Decrease the shrink counter when this es is not delayed */
487 1275 : if (!ext4_es_is_delayed(es)) {
488 1173 : BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
489 1173 : if (!--EXT4_I(inode)->i_es_shk_nr)
490 953 : ext4_es_list_del(inode);
491 1173 : percpu_counter_dec(&EXT4_SB(inode->i_sb)->
492 : s_es_stats.es_stats_shk_cnt);
493 : }
494 :
495 1275 : kmem_cache_free(ext4_es_cachep, es);
496 1275 : }
497 :
498 : /*
499 : * Check whether or not two extents can be merged
500 : * Condition:
501 : * - logical block number is contiguous
502 : * - physical block number is contiguous
503 : * - status is equal
504 : */
505 7162 : static int ext4_es_can_be_merged(struct extent_status *es1,
506 : struct extent_status *es2)
507 : {
508 7162 : if (ext4_es_type(es1) != ext4_es_type(es2))
509 : return 0;
510 :
511 3870 : if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
512 0 : pr_warn("ES assertion failed when merging extents. "
513 : "The sum of lengths of es1 (%d) and es2 (%d) "
514 : "is bigger than allowed file size (%d)\n",
515 : es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
516 0 : WARN_ON(1);
517 0 : return 0;
518 : }
519 :
520 3870 : if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
521 : return 0;
522 :
523 1977 : if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
524 734 : (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
525 : return 1;
526 :
527 1767 : if (ext4_es_is_hole(es1))
528 : return 1;
529 :
530 : /* we need to check that the delayed extent is without unwritten status */
531 1767 : if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
532 1243 : return 1;
533 :
534 : return 0;
535 : }
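/*
 * Example of the conditions above: written extents [0/8) at pblk 100
 * and [8/4) at pblk 108 are logically and physically contiguous with
 * equal status, so they can be merged into [0/12) at pblk 100. The
 * same [8/4) at pblk 200 cannot be merged, since the physical blocks
 * are not contiguous.
 */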
536 :
537 : static struct extent_status *
538 199 : ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
539 : {
540 199 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
541 199 : struct extent_status *es1;
542 199 : struct rb_node *node;
543 :
544 199 : node = rb_prev(&es->rb_node);
545 199 : if (!node)
546 : return es;
547 :
548 0 : es1 = rb_entry(node, struct extent_status, rb_node);
549 0 : if (ext4_es_can_be_merged(es1, es)) {
550 0 : es1->es_len += es->es_len;
551 0 : if (ext4_es_is_referenced(es))
552 0 : ext4_es_set_referenced(es1);
553 0 : rb_erase(&es->rb_node, &tree->root);
554 0 : ext4_es_free_extent(inode, es);
555 0 : es = es1;
556 : }
557 :
558 : return es;
559 : }
560 :
561 : static struct extent_status *
562 1254 : ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
563 : {
564 1254 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
565 1254 : struct extent_status *es1;
566 1254 : struct rb_node *node;
567 :
568 1254 : node = rb_next(&es->rb_node);
569 1254 : if (!node)
570 : return es;
571 :
572 1251 : es1 = rb_entry(node, struct extent_status, rb_node);
573 1251 : if (ext4_es_can_be_merged(es, es1)) {
574 0 : es->es_len += es1->es_len;
575 0 : if (ext4_es_is_referenced(es1))
576 0 : ext4_es_set_referenced(es);
577 0 : rb_erase(node, &tree->root);
578 0 : ext4_es_free_extent(inode, es1);
579 : }
580 :
581 : return es;
582 : }
583 :
584 : #ifdef ES_AGGRESSIVE_TEST
585 : #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
586 :
587 : static void ext4_es_insert_extent_ext_check(struct inode *inode,
588 : struct extent_status *es)
589 : {
590 : struct ext4_ext_path *path = NULL;
591 : struct ext4_extent *ex;
592 : ext4_lblk_t ee_block;
593 : ext4_fsblk_t ee_start;
594 : unsigned short ee_len;
595 : int depth, ee_status, es_status;
596 :
597 : path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
598 : if (IS_ERR(path))
599 : return;
600 :
601 : depth = ext_depth(inode);
602 : ex = path[depth].p_ext;
603 :
604 : if (ex) {
605 :
606 : ee_block = le32_to_cpu(ex->ee_block);
607 : ee_start = ext4_ext_pblock(ex);
608 : ee_len = ext4_ext_get_actual_len(ex);
609 :
610 : ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
611 : es_status = ext4_es_is_unwritten(es) ? 1 : 0;
612 :
613 : /*
614 : * Make sure ex and es are not overlap when we try to insert
615 : * a delayed/hole extent.
616 : */
617 : if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
618 : if (in_range(es->es_lblk, ee_block, ee_len)) {
619 : pr_warn("ES insert assertion failed for "
620 : "inode: %lu we can find an extent "
621 : "at block [%d/%d/%llu/%c], but we "
622 : "want to add a delayed/hole extent "
623 : "[%d/%d/%llu/%x]\n",
624 : inode->i_ino, ee_block, ee_len,
625 : ee_start, ee_status ? 'u' : 'w',
626 : es->es_lblk, es->es_len,
627 : ext4_es_pblock(es), ext4_es_status(es));
628 : }
629 : goto out;
630 : }
631 :
632 : /*
633 : * We don't check ee_block == es->es_lblk, etc. because es
634 : * might be a part of whole extent, vice versa.
635 : */
636 : if (es->es_lblk < ee_block ||
637 : ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
638 : pr_warn("ES insert assertion failed for inode: %lu "
639 : "ex_status [%d/%d/%llu/%c] != "
640 : "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
641 : ee_block, ee_len, ee_start,
642 : ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
643 : ext4_es_pblock(es), es_status ? 'u' : 'w');
644 : goto out;
645 : }
646 :
647 : if (ee_status ^ es_status) {
648 : pr_warn("ES insert assertion failed for inode: %lu "
649 : "ex_status [%d/%d/%llu/%c] != "
650 : "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
651 : ee_block, ee_len, ee_start,
652 : ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
653 : ext4_es_pblock(es), es_status ? 'u' : 'w');
654 : }
655 : } else {
656 : /*
657 : * We can't find an extent on disk. So we need to make sure
658 : * that we are not adding a written/unwritten extent.
659 : */
660 : if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
661 : pr_warn("ES insert assertion failed for inode: %lu "
662 : "can't find an extent at block %d but we want "
663 : "to add a written/unwritten extent "
664 : "[%d/%d/%llu/%x]\n", inode->i_ino,
665 : es->es_lblk, es->es_lblk, es->es_len,
666 : ext4_es_pblock(es), ext4_es_status(es));
667 : }
668 : }
669 : out:
670 : ext4_ext_drop_refs(path);
671 : kfree(path);
672 : }
673 :
674 : static void ext4_es_insert_extent_ind_check(struct inode *inode,
675 : struct extent_status *es)
676 : {
677 : struct ext4_map_blocks map;
678 : int retval;
679 :
680 : /*
681 : * Here we call ext4_ind_map_blocks to look up a block mapping because
682 : * the 'Indirect' structure is defined in indirect.c, so we cannot
683 : * access the direct/indirect tree from outside it. It would be too
684 : * ugly to define this function in indirect.c.
685 : */
686 :
687 : map.m_lblk = es->es_lblk;
688 : map.m_len = es->es_len;
689 :
690 : retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
691 : if (retval > 0) {
692 : if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
693 : /*
694 : * We want to add a delayed/hole extent but this
695 : * block has been allocated.
696 : */
697 : pr_warn("ES insert assertion failed for inode: %lu "
698 : "We can find blocks but we want to add a "
699 : "delayed/hole extent [%d/%d/%llu/%x]\n",
700 : inode->i_ino, es->es_lblk, es->es_len,
701 : ext4_es_pblock(es), ext4_es_status(es));
702 : return;
703 : } else if (ext4_es_is_written(es)) {
704 : if (retval != es->es_len) {
705 : pr_warn("ES insert assertion failed for "
706 : "inode: %lu retval %d != es_len %d\n",
707 : inode->i_ino, retval, es->es_len);
708 : return;
709 : }
710 : if (map.m_pblk != ext4_es_pblock(es)) {
711 : pr_warn("ES insert assertion failed for "
712 : "inode: %lu m_pblk %llu != "
713 : "es_pblk %llu\n",
714 : inode->i_ino, map.m_pblk,
715 : ext4_es_pblock(es));
716 : return;
717 : }
718 : } else {
719 : /*
720 : * We don't need to check unwritten extents because
721 : * indirect-based files don't have them.
722 : */
723 : BUG();
724 : }
725 : } else if (retval == 0) {
726 : if (ext4_es_is_written(es)) {
727 : pr_warn("ES insert assertion failed for inode: %lu "
728 : "We can't find the block but we want to add "
729 : "a written extent [%d/%d/%llu/%x]\n",
730 : inode->i_ino, es->es_lblk, es->es_len,
731 : ext4_es_pblock(es), ext4_es_status(es));
732 : return;
733 : }
734 : }
735 : }
736 :
737 : static inline void ext4_es_insert_extent_check(struct inode *inode,
738 : struct extent_status *es)
739 : {
740 : /*
741 : * We don't need to worry about the race condition because
742 : * the caller holds i_data_sem.
743 : */
744 : BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
745 : if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
746 : ext4_es_insert_extent_ext_check(inode, es);
747 : else
748 : ext4_es_insert_extent_ind_check(inode, es);
749 : }
750 : #else
751 3674 : static inline void ext4_es_insert_extent_check(struct inode *inode,
752 : struct extent_status *es)
753 : {
754 3674 : }
755 : #endif
756 :
757 5186 : static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
758 : {
759 5186 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
760 5186 : struct rb_node **p = &tree->root.rb_node;
761 5186 : struct rb_node *parent = NULL;
762 5186 : struct extent_status *es;
763 :
764 9644 : while (*p) {
765 5911 : parent = *p;
766 5911 : es = rb_entry(parent, struct extent_status, rb_node);
767 :
768 5911 : if (newes->es_lblk < es->es_lblk) {
769 2193 : if (ext4_es_can_be_merged(newes, es)) {
770 : /*
771 : * Here we can modify es_lblk directly
772 : * because the new extent does not overlap it.
773 : */
774 199 : es->es_lblk = newes->es_lblk;
775 199 : es->es_len += newes->es_len;
776 199 : if (ext4_es_is_written(es) ||
777 0 : ext4_es_is_unwritten(es))
778 199 : ext4_es_store_pblock(es,
779 : newes->es_pblk);
780 199 : es = ext4_es_try_to_merge_left(inode, es);
781 199 : goto out;
782 : }
783 1994 : p = &(*p)->rb_left;
784 3718 : } else if (newes->es_lblk > ext4_es_end(es)) {
785 3718 : if (ext4_es_can_be_merged(es, newes)) {
786 1254 : es->es_len += newes->es_len;
787 1254 : es = ext4_es_try_to_merge_right(inode, es);
788 1254 : goto out;
789 : }
790 2464 : p = &(*p)->rb_right;
791 : } else {
792 0 : BUG();
793 : return -EINVAL;
794 : }
795 : }
796 :
797 3733 : es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
798 : newes->es_pblk);
799 3733 : if (!es)
800 : return -ENOMEM;
801 3733 : rb_link_node(&es->rb_node, parent, p);
802 3733 : rb_insert_color(&es->rb_node, &tree->root);
803 :
804 5186 : out:
805 5186 : tree->cache_es = es;
806 5186 : return 0;
807 : }
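/*
 * Insert cases above, by example: with a written extent [0/8) at
 * pblk 100 in the tree, inserting [8/4) at pblk 108 extends that node
 * in place while descending (then tries a further merge to the right);
 * inserting a non-adjacent [20/4) allocates and links a fresh node.
 */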
808 :
809 : /*
810 : * ext4_es_insert_extent() adds information to an inode's extent
811 : * status tree.
812 : *
813 : * Return 0 on success, error code on failure.
814 : */
815 1971 : int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
816 : ext4_lblk_t len, ext4_fsblk_t pblk,
817 : unsigned int status)
818 : {
819 1971 : struct extent_status newes;
820 1971 : ext4_lblk_t end = lblk + len - 1;
821 1971 : int err = 0;
822 1971 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
823 :
824 1971 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
825 : return 0;
826 :
827 1971 : es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
828 : lblk, len, pblk, status, inode->i_ino);
829 :
830 1971 : if (!len)
831 : return 0;
832 :
833 1971 : BUG_ON(end < lblk);
834 :
835 1971 : if ((status & EXTENT_STATUS_DELAYED) &&
836 : (status & EXTENT_STATUS_WRITTEN)) {
837 0 : ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
838 : " delayed and written which can potentially "
839 : " cause data loss.", lblk, len);
840 0 : WARN_ON(1);
841 : }
842 :
843 1971 : newes.es_lblk = lblk;
844 1971 : newes.es_len = len;
845 1971 : ext4_es_store_pblock_status(&newes, pblk, status);
846 1971 : trace_ext4_es_insert_extent(inode, &newes);
847 :
848 1971 : ext4_es_insert_extent_check(inode, &newes);
849 :
850 1971 : write_lock(&EXT4_I(inode)->i_es_lock);
851 1971 : err = __es_remove_extent(inode, lblk, end, NULL);
852 1971 : if (err != 0)
853 0 : goto error;
854 1971 : retry:
855 1971 : err = __es_insert_extent(inode, &newes);
856 1971 : if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
857 0 : 128, EXT4_I(inode)))
858 0 : goto retry;
859 1971 : if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
860 0 : err = 0;
861 :
862 1971 : if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
863 0 : (status & EXTENT_STATUS_WRITTEN ||
864 : status & EXTENT_STATUS_UNWRITTEN))
865 0 : __revise_pending(inode, lblk, len);
866 :
867 1971 : error:
868 1971 : write_unlock(&EXT4_I(inode)->i_es_lock);
869 :
870 1971 : ext4_es_print_tree(inode);
871 :
872 1971 : return err;
873 : }
874 :
875 : /*
876 : * ext4_es_cache_extent() inserts information into the extent status
877 : * tree if and only if there isn't information about the range in
878 : * question already.
879 : */
880 1623 : void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
881 : ext4_lblk_t len, ext4_fsblk_t pblk,
882 : unsigned int status)
883 : {
884 1623 : struct extent_status *es;
885 1623 : struct extent_status newes;
886 1623 : ext4_lblk_t end = lblk + len - 1;
887 :
888 1623 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
889 0 : return;
890 :
891 1623 : newes.es_lblk = lblk;
892 1623 : newes.es_len = len;
893 1623 : ext4_es_store_pblock_status(&newes, pblk, status);
894 1623 : trace_ext4_es_cache_extent(inode, &newes);
895 :
896 1623 : if (!len)
897 : return;
898 :
899 1623 : BUG_ON(end < lblk);
900 :
901 1623 : write_lock(&EXT4_I(inode)->i_es_lock);
902 :
903 1623 : es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
904 1623 : if (!es || es->es_lblk > end)
905 1512 : __es_insert_extent(inode, &newes);
906 1623 : write_unlock(&EXT4_I(inode)->i_es_lock);
907 : }
908 :
909 : /*
910 : * ext4_es_lookup_extent() looks up an extent in extent status tree.
911 : *
912 : * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
913 : *
914 : * Return: 1 if found, 0 if not
915 : */
916 17979 : int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
917 : ext4_lblk_t *next_lblk,
918 : struct extent_status *es)
919 : {
920 17979 : struct ext4_es_tree *tree;
921 17979 : struct ext4_es_stats *stats;
922 17979 : struct extent_status *es1 = NULL;
923 17979 : struct rb_node *node;
924 17979 : int found = 0;
925 :
926 17979 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
927 : return 0;
928 :
929 17979 : trace_ext4_es_lookup_extent_enter(inode, lblk);
930 17979 : es_debug("lookup extent in block %u\n", lblk);
931 :
932 17979 : tree = &EXT4_I(inode)->i_es_tree;
933 17979 : read_lock(&EXT4_I(inode)->i_es_lock);
934 :
935 : /* find extent in cache first */
936 17979 : es->es_lblk = es->es_len = es->es_pblk = 0;
937 17979 : if (tree->cache_es) {
938 16325 : es1 = tree->cache_es;
939 16325 : if (in_range(lblk, es1->es_lblk, es1->es_len)) {
940 12459 : es_debug("%u cached by [%u/%u)\n",
941 : lblk, es1->es_lblk, es1->es_len);
942 12459 : found = 1;
943 12459 : goto out;
944 : }
945 : }
946 :
947 5520 : node = tree->root.rb_node;
948 9083 : while (node) {
949 7425 : es1 = rb_entry(node, struct extent_status, rb_node);
950 7425 : if (lblk < es1->es_lblk)
951 1202 : node = node->rb_left;
952 6223 : else if (lblk > ext4_es_end(es1))
953 2361 : node = node->rb_right;
954 : else {
955 : found = 1;
956 : break;
957 : }
958 : }
959 :
960 1658 : out:
961 17979 : stats = &EXT4_SB(inode->i_sb)->s_es_stats;
962 17979 : if (found) {
963 16321 : BUG_ON(!es1);
964 16321 : es->es_lblk = es1->es_lblk;
965 16321 : es->es_len = es1->es_len;
966 16321 : es->es_pblk = es1->es_pblk;
967 16321 : if (!ext4_es_is_referenced(es1))
968 1196 : ext4_es_set_referenced(es1);
969 16321 : percpu_counter_inc(&stats->es_stats_cache_hits);
970 16321 : if (next_lblk) {
971 0 : node = rb_next(&es1->rb_node);
972 0 : if (node) {
973 0 : es1 = rb_entry(node, struct extent_status,
974 : rb_node);
975 0 : *next_lblk = es1->es_lblk;
976 : } else
977 0 : *next_lblk = 0;
978 : }
979 : } else {
980 1658 : percpu_counter_inc(&stats->es_stats_cache_misses);
981 : }
982 :
983 17979 : read_unlock(&EXT4_I(inode)->i_es_lock);
984 :
985 17979 : trace_ext4_es_lookup_extent_exit(inode, es, found);
986 17979 : return found;
987 : }
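/*
 * A minimal caller sketch in the style of ext4_map_blocks(): consult
 * the status tree before the on-disk extent tree (everything except
 * the ext4_es_* helpers below is hypothetical):
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
 *		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es))
 *			map->m_pblk = ext4_es_pblock(&es) +
 *				      map->m_lblk - es.es_lblk;
 *		(delayed/hole extents carry no physical block)
 *	} else {
 *		(cache miss: fall back to reading the extent tree)
 *	}
 */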
988 :
989 : struct rsvd_count {
990 : int ndelonly;
991 : bool first_do_lblk_found;
992 : ext4_lblk_t first_do_lblk;
993 : ext4_lblk_t last_do_lblk;
994 : struct extent_status *left_es;
995 : bool partial;
996 : ext4_lblk_t lclu;
997 : };
998 :
999 : /*
1000 : * init_rsvd - initialize reserved count data before removing block range
1001 : * in file from extent status tree
1002 : *
1003 : * @inode - file containing range
1004 : * @lblk - first block in range
1005 : * @es - pointer to first extent in range
1006 : * @rc - pointer to reserved count data
1007 : *
1008 : * Assumes es is not NULL
1009 : */
1010 154 : static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1011 : struct extent_status *es, struct rsvd_count *rc)
1012 : {
1013 154 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1014 154 : struct rb_node *node;
1015 :
1016 154 : rc->ndelonly = 0;
1017 :
1018 : /*
1019 : * for bigalloc, note that the first delonly block in the range has not
1020 : * yet been found, record the extent containing the block to the left
1021 : * of the region to be removed, if any, and note that there's no
1022 : * partial cluster to track
1023 : */
1024 154 : if (sbi->s_cluster_ratio > 1) {
1025 0 : rc->first_do_lblk_found = false;
1026 0 : if (lblk > es->es_lblk) {
1027 0 : rc->left_es = es;
1028 : } else {
1029 0 : node = rb_prev(&es->rb_node);
1030 0 : rc->left_es = node ? rb_entry(node,
1031 : struct extent_status,
1032 0 : rb_node) : NULL;
1033 : }
1034 0 : rc->partial = false;
1035 : }
1036 154 : }
1037 :
1038 : /*
1039 : * count_rsvd - count the clusters containing delayed and not unwritten
1040 : * (delonly) blocks in a range within an extent and add to
1041 : * the running tally in rsvd_count
1042 : *
1043 : * @inode - file containing extent
1044 : * @lblk - first block in range
1045 : * @len - length of range in blocks
1046 : * @es - pointer to extent containing clusters to be counted
1047 : * @rc - pointer to reserved count data
1048 : *
1049 : * Tracks partial clusters found at the beginning and end of extents so
1050 : * they aren't overcounted when they span adjacent extents
1051 : */
1052 309 : static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1053 : struct extent_status *es, struct rsvd_count *rc)
1054 : {
1055 309 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1056 309 : ext4_lblk_t i, end, nclu;
1057 :
1058 309 : if (!ext4_es_is_delonly(es))
1059 : return;
1060 :
1061 39 : WARN_ON(len <= 0);
1062 :
1063 39 : if (sbi->s_cluster_ratio == 1) {
1064 39 : rc->ndelonly += (int) len;
1065 39 : return;
1066 : }
1067 :
1068 : /* bigalloc */
1069 :
1070 0 : i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1071 0 : end = lblk + (ext4_lblk_t) len - 1;
1072 0 : end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1073 :
1074 : /* record the first block of the first delonly extent seen */
1075 0 : if (!rc->first_do_lblk_found) {
1076 0 : rc->first_do_lblk = i;
1077 0 : rc->first_do_lblk_found = true;
1078 : }
1079 :
1080 : /* update the last lblk in the region seen so far */
1081 0 : rc->last_do_lblk = end;
1082 :
1083 : /*
1084 : * if we're tracking a partial cluster and the current extent
1085 : * doesn't start with it, count it and stop tracking
1086 : */
1087 0 : if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1088 0 : rc->ndelonly++;
1089 0 : rc->partial = false;
1090 : }
1091 :
1092 : /*
1093 : * if the first cluster doesn't start on a cluster boundary but
1094 : * ends on one, count it
1095 : */
1096 0 : if (EXT4_LBLK_COFF(sbi, i) != 0) {
1097 0 : if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1098 0 : rc->ndelonly++;
1099 0 : rc->partial = false;
1100 0 : i = EXT4_LBLK_CFILL(sbi, i) + 1;
1101 : }
1102 : }
1103 :
1104 : /*
1105 : * if the current cluster starts on a cluster boundary, count the
1106 : * number of whole delonly clusters in the extent
1107 : */
1108 0 : if ((i + sbi->s_cluster_ratio - 1) <= end) {
1109 0 : nclu = (end - i + 1) >> sbi->s_cluster_bits;
1110 0 : rc->ndelonly += nclu;
1111 0 : i += nclu << sbi->s_cluster_bits;
1112 : }
1113 :
1114 : /*
1115 : * start tracking a partial cluster if there's a partial at the end
1116 : * of the current extent and we're not already tracking one
1117 : */
1118 0 : if (!rc->partial && i <= end) {
1119 0 : rc->partial = true;
1120 0 : rc->lclu = EXT4_B2C(sbi, i);
1121 : }
1122 : }
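/*
 * Worked example for the bigalloc path above, assuming
 * s_cluster_ratio = 4 and a delonly extent covering blocks 2..9:
 * the leading partial cluster (blocks 2..3) ends on a cluster
 * boundary, so it is counted and i advances to block 4; blocks 4..7
 * form one whole cluster, adding nclu = 1; block 8 starts a trailing
 * partial cluster, which is only tracked (rc->partial, rc->lclu = 2)
 * in case an adjacent extent continues it.
 */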
1123 :
1124 : /*
1125 : * __pr_tree_search - search for a pending cluster reservation
1126 : *
1127 : * @root - root of pending reservation tree
1128 : * @lclu - logical cluster to search for
1129 : *
1130 : * Returns the pending reservation for the cluster identified by @lclu
1131 : * if found. If not, returns a reservation for the next cluster if any,
1132 : * and if not, returns NULL.
1133 : */
1134 0 : static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1135 : ext4_lblk_t lclu)
1136 : {
1137 0 : struct rb_node *node = root->rb_node;
1138 0 : struct pending_reservation *pr = NULL;
1139 :
1140 0 : while (node) {
1141 0 : pr = rb_entry(node, struct pending_reservation, rb_node);
1142 0 : if (lclu < pr->lclu)
1143 0 : node = node->rb_left;
1144 0 : else if (lclu > pr->lclu)
1145 0 : node = node->rb_right;
1146 : else
1147 0 : return pr;
1148 : }
1149 0 : if (pr && lclu < pr->lclu)
1150 : return pr;
1151 0 : if (pr && lclu > pr->lclu) {
1152 0 : node = rb_next(&pr->rb_node);
1153 0 : return node ? rb_entry(node, struct pending_reservation,
1154 0 : rb_node) : NULL;
1155 : }
1156 : return NULL;
1157 : }
1158 :
1159 : /*
1160 : * get_rsvd - calculates and returns the number of cluster reservations to be
1161 : * released when removing a block range from the extent status tree
1162 : * and releases any pending reservations within the range
1163 : *
1164 : * @inode - file containing block range
1165 : * @end - last block in range
1166 : * @right_es - pointer to extent containing next block beyond end or NULL
1167 : * @rc - pointer to reserved count data
1168 : *
1169 : * The number of reservations to be released is equal to the number of
1170 : * clusters containing delayed and not unwritten (delonly) blocks within
1171 : * the range, minus the number of clusters still containing delonly blocks
1172 : * at the ends of the range, and minus the number of pending reservations
1173 : * within the range.
1174 : */
1175 154 : static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1176 : struct extent_status *right_es,
1177 : struct rsvd_count *rc)
1178 : {
1179 154 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1180 154 : struct pending_reservation *pr;
1181 154 : struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1182 154 : struct rb_node *node;
1183 154 : ext4_lblk_t first_lclu, last_lclu;
1184 154 : bool left_delonly, right_delonly, count_pending;
1185 154 : struct extent_status *es;
1186 :
1187 154 : if (sbi->s_cluster_ratio > 1) {
1188 : /* count any remaining partial cluster */
1189 0 : if (rc->partial)
1190 0 : rc->ndelonly++;
1191 :
1192 0 : if (rc->ndelonly == 0)
1193 : return 0;
1194 :
1195 0 : first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1196 0 : last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1197 :
1198 : /*
1199 : * decrease the delonly count by the number of clusters at the
1200 : * ends of the range that still contain delonly blocks -
1201 : * these clusters still need to be reserved
1202 : */
1203 0 : left_delonly = right_delonly = false;
1204 :
1205 0 : es = rc->left_es;
1206 0 : while (es && ext4_es_end(es) >=
1207 0 : EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1208 0 : if (ext4_es_is_delonly(es)) {
1209 0 : rc->ndelonly--;
1210 0 : left_delonly = true;
1211 0 : break;
1212 : }
1213 0 : node = rb_prev(&es->rb_node);
1214 0 : if (!node)
1215 : break;
1216 0 : es = rb_entry(node, struct extent_status, rb_node);
1217 : }
1218 0 : if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1219 0 : if (end < ext4_es_end(right_es)) {
1220 : es = right_es;
1221 : } else {
1222 0 : node = rb_next(&right_es->rb_node);
1223 0 : es = node ? rb_entry(node, struct extent_status,
1224 0 : rb_node) : NULL;
1225 : }
1226 0 : while (es && es->es_lblk <=
1227 0 : EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1228 0 : if (ext4_es_is_delonly(es)) {
1229 0 : rc->ndelonly--;
1230 0 : right_delonly = true;
1231 0 : break;
1232 : }
1233 0 : node = rb_next(&es->rb_node);
1234 0 : if (!node)
1235 : break;
1236 0 : es = rb_entry(node, struct extent_status,
1237 : rb_node);
1238 : }
1239 : }
1240 :
1241 : /*
1242 : * Determine the block range that should be searched for
1243 : * pending reservations, if any. Clusters on the ends of the
1244 : * original removed range containing delonly blocks are
1245 : * excluded. They've already been accounted for and it's not
1246 : * possible to determine if an associated pending reservation
1247 : * should be released with the information available in the
1248 : * extents status tree.
1249 : */
1250 0 : if (first_lclu == last_lclu) {
1251 0 : if (left_delonly | right_delonly)
1252 : count_pending = false;
1253 : else
1254 0 : count_pending = true;
1255 : } else {
1256 0 : if (left_delonly)
1257 0 : first_lclu++;
1258 0 : if (right_delonly)
1259 0 : last_lclu--;
1260 0 : if (first_lclu <= last_lclu)
1261 : count_pending = true;
1262 : else
1263 0 : count_pending = false;
1264 : }
1265 :
1266 : /*
1267 : * a pending reservation found between first_lclu and last_lclu
1268 : * represents an allocated cluster that contained at least one
1269 : * delonly block, so the delonly total must be reduced by one
1270 : * for each pending reservation found and released
1271 : */
1272 0 : if (count_pending) {
1273 0 : pr = __pr_tree_search(&tree->root, first_lclu);
1274 0 : while (pr && pr->lclu <= last_lclu) {
1275 0 : rc->ndelonly--;
1276 0 : node = rb_next(&pr->rb_node);
1277 0 : rb_erase(&pr->rb_node, &tree->root);
1278 0 : kmem_cache_free(ext4_pending_cachep, pr);
1279 0 : if (!node)
1280 : break;
1281 0 : pr = rb_entry(node, struct pending_reservation,
1282 : rb_node);
1283 : }
1284 : }
1285 : }
1286 154 : return rc->ndelonly;
1287 : }
1288 :
1289 :
1290 : /*
1291 : * __es_remove_extent - removes block range from extent status tree
1292 : *
1293 : * @inode - file containing range
1294 : * @lblk - first block in range
1295 : * @end - last block in range
1296 : * @reserved - number of cluster reservations released
1297 : *
1298 : * If @reserved is not NULL and delayed allocation is enabled, counts
1299 : * block/cluster reservations freed by removing range and if bigalloc
1300 : * enabled cancels pending reservations as needed. Returns 0 on success,
1301 : * error code on failure.
1302 : */
1303 4007 : static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1304 : ext4_lblk_t end, int *reserved)
1305 : {
1306 4007 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1307 4007 : struct rb_node *node;
1308 4007 : struct extent_status *es;
1309 4007 : struct extent_status orig_es;
1310 4007 : ext4_lblk_t len1, len2;
1311 4007 : ext4_fsblk_t block;
1312 4007 : int err;
1313 4007 : bool count_reserved = true;
1314 4007 : struct rsvd_count rc;
1315 :
1316 4007 : if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1317 4007 : count_reserved = false;
1318 4007 : retry:
1319 4007 : err = 0;
1320 :
1321 4007 : es = __es_tree_search(&tree->root, lblk);
1322 4007 : if (!es)
1323 802 : goto out;
1324 3205 : if (es->es_lblk > end)
1325 0 : goto out;
1326 :
1327 : /* Simply invalidate cache_es. */
1328 3205 : tree->cache_es = NULL;
1329 3205 : if (count_reserved)
1330 154 : init_rsvd(inode, lblk, es, &rc);
1331 :
1332 3205 : orig_es.es_lblk = es->es_lblk;
1333 3205 : orig_es.es_len = es->es_len;
1334 3205 : orig_es.es_pblk = es->es_pblk;
1335 :
1336 3205 : len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1337 3205 : len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1338 3205 : if (len1 > 0)
1339 3 : es->es_len = len1;
1340 3205 : if (len2 > 0) {
1341 2082 : if (len1 > 0) {
1342 0 : struct extent_status newes;
1343 :
1344 0 : newes.es_lblk = end + 1;
1345 0 : newes.es_len = len2;
1346 0 : block = 0x7FDEADBEEFULL;
1347 0 : if (ext4_es_is_written(&orig_es) ||
1348 0 : ext4_es_is_unwritten(&orig_es))
1349 0 : block = ext4_es_pblock(&orig_es) +
1350 0 : orig_es.es_len - len2;
1351 0 : ext4_es_store_pblock_status(&newes, block,
1352 : ext4_es_status(&orig_es));
1353 0 : err = __es_insert_extent(inode, &newes);
1354 0 : if (err) {
1355 0 : es->es_lblk = orig_es.es_lblk;
1356 0 : es->es_len = orig_es.es_len;
1357 0 : if ((err == -ENOMEM) &&
1358 0 : __es_shrink(EXT4_SB(inode->i_sb),
1359 0 : 128, EXT4_I(inode)))
1360 0 : goto retry;
1361 0 : goto out;
1362 : }
1363 : } else {
1364 2082 : es->es_lblk = end + 1;
1365 2082 : es->es_len = len2;
1366 2082 : if (ext4_es_is_written(es) ||
1367 1883 : ext4_es_is_unwritten(es)) {
1368 199 : block = orig_es.es_pblk + orig_es.es_len - len2;
1369 199 : ext4_es_store_pblock(es, block);
1370 : }
1371 : }
1372 2082 : if (count_reserved)
1373 0 : count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
1374 : &orig_es, &rc);
1375 2082 : goto out;
1376 : }
1377 :
1378 1123 : if (len1 > 0) {
1379 3 : if (count_reserved)
1380 0 : count_rsvd(inode, lblk, orig_es.es_len - len1,
1381 : &orig_es, &rc);
1382 3 : node = rb_next(&es->rb_node);
1383 3 : if (node)
1384 1438 : es = rb_entry(node, struct extent_status, rb_node);
1385 : else
1386 3 : es = NULL;
1387 : }
1388 :
1389 1435 : while (es && ext4_es_end(es) <= end) {
1390 1275 : if (count_reserved)
1391 309 : count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1392 1275 : node = rb_next(&es->rb_node);
1393 1275 : rb_erase(&es->rb_node, &tree->root);
1394 1275 : ext4_es_free_extent(inode, es);
1395 1275 : if (!node) {
1396 : es = NULL;
1397 : break;
1398 : }
1399 1438 : es = rb_entry(node, struct extent_status, rb_node);
1400 : }
1401 :
1402 1123 : if (es && es->es_lblk < end + 1) {
1403 0 : ext4_lblk_t orig_len = es->es_len;
1404 :
1405 0 : len1 = ext4_es_end(es) - end;
1406 0 : if (count_reserved)
1407 0 : count_rsvd(inode, es->es_lblk, orig_len - len1,
1408 : es, &rc);
1409 0 : es->es_lblk = end + 1;
1410 0 : es->es_len = len1;
1411 0 : if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1412 0 : block = es->es_pblk + orig_len - len1;
1413 0 : ext4_es_store_pblock(es, block);
1414 : }
1415 : }
1416 :
1417 1123 : if (count_reserved)
1418 154 : *reserved = get_rsvd(inode, end, es, &rc);
1419 969 : out:
1420 4007 : return err;
1421 : }
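/*
 * Example of the split logic above: removing blocks [4, 7] from a
 * written extent [0/12) at pblk 100 trims the node to [0/4) at
 * pblk 100 (len1) and inserts a new extent [8/4) at pblk 108 (len2);
 * the 0x7FDEADBEEF value is only a poison block number for statuses
 * that carry no physical block.
 */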
1422 :
1423 : /*
1424 : * ext4_es_remove_extent - removes block range from extent status tree
1425 : *
1426 : * @inode - file containing range
1427 : * @lblk - first block in range
1428 : * @len - number of blocks to remove
1429 : *
1430 : * Reduces block/cluster reservation count and for bigalloc cancels pending
1431 : * reservations as needed. Returns 0 on success, error code on failure.
1432 : */
1433 333 : int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1434 : ext4_lblk_t len)
1435 : {
1436 333 : ext4_lblk_t end;
1437 333 : int err = 0;
1438 333 : int reserved = 0;
1439 :
1440 333 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1441 : return 0;
1442 :
1443 333 : trace_ext4_es_remove_extent(inode, lblk, len);
1444 333 : es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1445 : lblk, len, inode->i_ino);
1446 :
1447 333 : if (!len)
1448 : return err;
1449 :
1450 333 : end = lblk + len - 1;
1451 333 : BUG_ON(end < lblk);
1452 :
1453 : /*
1454 : * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1455 : * so that we are sure __es_shrink() is done with the inode before it
1456 : * is reclaimed.
1457 : */
1458 333 : write_lock(&EXT4_I(inode)->i_es_lock);
1459 333 : err = __es_remove_extent(inode, lblk, end, &reserved);
1460 333 : write_unlock(&EXT4_I(inode)->i_es_lock);
1461 333 : ext4_es_print_tree(inode);
1462 333 : ext4_da_release_space(inode, reserved);
1463 333 : return err;
1464 : }
1465 :
1466 0 : static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1467 : struct ext4_inode_info *locked_ei)
1468 : {
1469 0 : struct ext4_inode_info *ei;
1470 0 : struct ext4_es_stats *es_stats;
1471 0 : ktime_t start_time;
1472 0 : u64 scan_time;
1473 0 : int nr_to_walk;
1474 0 : int nr_shrunk = 0;
1475 0 : int retried = 0, nr_skipped = 0;
1476 :
1477 0 : es_stats = &sbi->s_es_stats;
1478 0 : start_time = ktime_get();
1479 :
1480 0 : retry:
1481 0 : spin_lock(&sbi->s_es_lock);
1482 0 : nr_to_walk = sbi->s_es_nr_inode;
1483 0 : while (nr_to_walk-- > 0) {
1484 0 : if (list_empty(&sbi->s_es_list)) {
1485 0 : spin_unlock(&sbi->s_es_lock);
1486 0 : goto out;
1487 : }
1488 0 : ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1489 : i_es_list);
1490 : /* Move the inode to the tail */
1491 0 : list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1492 :
1493 : /*
1494 : * Normally we try hard to avoid shrinking precached inodes,
1495 : * but we will as a last resort.
1496 : */
1497 0 : if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1498 : EXT4_STATE_EXT_PRECACHED)) {
1499 0 : nr_skipped++;
1500 0 : continue;
1501 : }
1502 :
1503 0 : if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1504 0 : nr_skipped++;
1505 0 : continue;
1506 : }
1507 : /*
1508 : * Now we hold i_es_lock which protects us from inode reclaim
1509 : * freeing inode under us
1510 : */
1511 0 : spin_unlock(&sbi->s_es_lock);
1512 :
1513 0 : nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1514 0 : write_unlock(&ei->i_es_lock);
1515 :
1516 0 : if (nr_to_scan <= 0)
1517 0 : goto out;
1518 0 : spin_lock(&sbi->s_es_lock);
1519 : }
1520 0 : spin_unlock(&sbi->s_es_lock);
1521 :
1522 : /*
1523 : * If we skipped any inodes, and we weren't able to make any
1524 : * forward progress, try again to scan precached inodes.
1525 : */
1526 0 : if ((nr_shrunk == 0) && nr_skipped && !retried) {
1527 0 : retried++;
1528 0 : goto retry;
1529 : }
1530 :
1531 0 : if (locked_ei && nr_shrunk == 0)
1532 0 : nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1533 :
1534 0 : out:
1535 0 : scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1536 0 : if (likely(es_stats->es_stats_scan_time))
1537 0 : es_stats->es_stats_scan_time = (scan_time +
1538 0 : es_stats->es_stats_scan_time*3) / 4;
1539 : else
1540 0 : es_stats->es_stats_scan_time = scan_time;
1541 0 : if (scan_time > es_stats->es_stats_max_scan_time)
1542 0 : es_stats->es_stats_max_scan_time = scan_time;
1543 0 : if (likely(es_stats->es_stats_shrunk))
1544 0 : es_stats->es_stats_shrunk = (nr_shrunk +
1545 0 : es_stats->es_stats_shrunk*3) / 4;
1546 : else
1547 0 : es_stats->es_stats_shrunk = nr_shrunk;
1548 :
1549 0 : trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1550 : nr_skipped, retried);
1551 0 : return nr_shrunk;
1552 : }
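/*
 * The scan-time and shrunk-object statistics above are smoothed with a
 * simple exponential moving average, new = (sample + 3 * old) / 4; for
 * example, with a previous average of 1000 ns and a 2000 ns scan, the
 * new average becomes (2000 + 3000) / 4 = 1250 ns.
 */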
1553 :
1554 0 : static unsigned long ext4_es_count(struct shrinker *shrink,
1555 : struct shrink_control *sc)
1556 : {
1557 0 : unsigned long nr;
1558 0 : struct ext4_sb_info *sbi;
1559 :
1560 0 : sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1561 0 : nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1562 0 : trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1563 0 : return nr;
1564 : }
1565 :
1566 0 : static unsigned long ext4_es_scan(struct shrinker *shrink,
1567 : struct shrink_control *sc)
1568 : {
1569 0 : struct ext4_sb_info *sbi = container_of(shrink,
1570 : struct ext4_sb_info, s_es_shrinker);
1571 0 : int nr_to_scan = sc->nr_to_scan;
1572 0 : int ret, nr_shrunk;
1573 :
1574 0 : ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1575 0 : trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1576 :
1577 0 : if (!nr_to_scan)
1578 0 : return ret;
1579 :
1580 0 : nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1581 :
1582 0 : trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1583 0 : return nr_shrunk;
1584 : }
1585 :
1586 0 : int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1587 : {
1588 0 : struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1589 0 : struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1590 0 : struct ext4_inode_info *ei, *max = NULL;
1591 0 : unsigned int inode_cnt = 0;
1592 :
1593 0 : if (v != SEQ_START_TOKEN)
1594 : return 0;
1595 :
1596 : /* here we just find an inode that has the max nr. of objects */
1597 0 : spin_lock(&sbi->s_es_lock);
1598 0 : list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1599 0 : inode_cnt++;
1600 0 : if (max && max->i_es_all_nr < ei->i_es_all_nr)
1601 : max = ei;
1602 0 : else if (!max)
1603 0 : max = ei;
1604 : }
1605 0 : spin_unlock(&sbi->s_es_lock);
1606 :
1607 0 : seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1608 : percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1609 : percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1610 0 : seq_printf(seq, " %lld/%lld cache hits/misses\n",
1611 : percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1612 : percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1613 0 : if (inode_cnt)
1614 0 : seq_printf(seq, " %d inodes on list\n", inode_cnt);
1615 :
1616 0 : seq_printf(seq, "average:\n %llu us scan time\n",
1617 : div_u64(es_stats->es_stats_scan_time, 1000));
1618 0 : seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
1619 0 : if (inode_cnt)
1620 0 : seq_printf(seq,
1621 : "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1622 : " %llu us max scan time\n",
1623 : max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1624 : div_u64(es_stats->es_stats_max_scan_time, 1000));
1625 :
1626 : return 0;
1627 : }
1628 :
1629 1 : int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1630 : {
1631 1 : int err;
1632 :
1633 : /* Make sure we have enough bits for physical block number */
1634 1 : BUILD_BUG_ON(ES_SHIFT < 48);
1635 1 : INIT_LIST_HEAD(&sbi->s_es_list);
1636 1 : sbi->s_es_nr_inode = 0;
1637 1 : spin_lock_init(&sbi->s_es_lock);
1638 1 : sbi->s_es_stats.es_stats_shrunk = 0;
1639 1 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1640 : GFP_KERNEL);
1641 1 : if (err)
1642 : return err;
1643 1 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1644 : GFP_KERNEL);
1645 1 : if (err)
1646 0 : goto err1;
1647 1 : sbi->s_es_stats.es_stats_scan_time = 0;
1648 1 : sbi->s_es_stats.es_stats_max_scan_time = 0;
1649 1 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1650 1 : if (err)
1651 0 : goto err2;
1652 1 : err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1653 1 : if (err)
1654 0 : goto err3;
1655 :
1656 1 : sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1657 1 : sbi->s_es_shrinker.count_objects = ext4_es_count;
1658 1 : sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1659 1 : err = register_shrinker(&sbi->s_es_shrinker);
1660 1 : if (err)
1661 0 : goto err4;
1662 :
1663 : return 0;
1664 0 : err4:
1665 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1666 0 : err3:
1667 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1668 0 : err2:
1669 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1670 0 : err1:
1671 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1672 0 : return err;
1673 : }
1674 :
1675 0 : void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1676 : {
1677 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1678 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1679 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1680 0 : percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1681 0 : unregister_shrinker(&sbi->s_es_shrinker);
1682 0 : }
1683 :
1684 : /*
1685 : * Shrink extents in the given inode from ei->i_es_shrink_lblk till @end. Scan
1686 : * at most *nr_to_scan extents and update *nr_to_scan accordingly.
1687 : *
1688 : * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1689 : * Increment *nr_shrunk by the number of reclaimed extents. Also update
1690 : * ei->i_es_shrink_lblk to where we should continue scanning.
1691 : */
1692 0 : static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1693 : int *nr_to_scan, int *nr_shrunk)
1694 : {
1695 0 : struct inode *inode = &ei->vfs_inode;
1696 0 : struct ext4_es_tree *tree = &ei->i_es_tree;
1697 0 : struct extent_status *es;
1698 0 : struct rb_node *node;
1699 :
1700 0 : es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1701 0 : if (!es)
1702 0 : goto out_wrap;
1703 :
1704 0 : while (*nr_to_scan > 0) {
1705 0 : if (es->es_lblk > end) {
1706 0 : ei->i_es_shrink_lblk = end + 1;
1707 0 : return 0;
1708 : }
1709 :
1710 0 : (*nr_to_scan)--;
1711 0 : node = rb_next(&es->rb_node);
1712 : /*
1713 : * We can't reclaim delayed extents from the status tree because
1714 : * fiemap, bigalloc, and seek_data/hole need them.
1715 : */
1716 0 : if (ext4_es_is_delayed(es))
1717 0 : goto next;
1718 0 : if (ext4_es_is_referenced(es)) {
1719 0 : ext4_es_clear_referenced(es);
1720 0 : goto next;
1721 : }
1722 :
1723 0 : rb_erase(&es->rb_node, &tree->root);
1724 0 : ext4_es_free_extent(inode, es);
1725 0 : (*nr_shrunk)++;
1726 0 : next:
1727 0 : if (!node)
1728 0 : goto out_wrap;
1729 0 : es = rb_entry(node, struct extent_status, rb_node);
1730 : }
1731 0 : ei->i_es_shrink_lblk = es->es_lblk;
1732 0 : return 1;
1733 0 : out_wrap:
1734 0 : ei->i_es_shrink_lblk = 0;
1735 0 : return 0;
1736 : }
1737 :
1738 0 : static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1739 : {
1740 0 : struct inode *inode = &ei->vfs_inode;
1741 0 : int nr_shrunk = 0;
1742 0 : ext4_lblk_t start = ei->i_es_shrink_lblk;
1743 0 : static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1744 : DEFAULT_RATELIMIT_BURST);
1745 :
1746 0 : if (ei->i_es_shk_nr == 0)
1747 : return 0;
1748 :
1749 0 : if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1750 0 : __ratelimit(&_rs))
1751 0 : ext4_warning(inode->i_sb, "forced shrink of precached extents");
1752 :
1753 0 : if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1754 : start != 0)
1755 0 : es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1756 :
1757 0 : ei->i_es_tree.cache_es = NULL;
1758 0 : return nr_shrunk;
1759 : }
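
/*
 * es_reclaim_extents() implements a cursor with wraparound: it resumes at
 * ei->i_es_shrink_lblk, scans to the end of the logical range, and if the
 * scan budget is not yet exhausted it wraps to cover [0, start - 1]. A
 * self-contained sketch of the same pattern over an array (all names are
 * hypothetical):
 */
#include <stdio.h>

/* returns 1 if the budget ran out mid-range, 0 if the range was finished */
static int scan_range(int *items, int from, int to, int *budget)
{
	for (int i = from; i <= to; i++) {
		if (*budget == 0)
			return 1;
		(*budget)--;
		items[i] = 0;		/* "reclaim" the slot */
	}
	return 0;
}

int main(void)
{
	int items[8] = {1, 1, 1, 1, 1, 1, 1, 1};
	int start = 5, budget = 6;

	/* first leg: cursor to end; second leg: wrap to just before start */
	if (!scan_range(items, start, 7, &budget) && start != 0)
		scan_range(items, 0, start - 1, &budget);

	for (int i = 0; i < 8; i++)
		printf("%d", items[i]);
	printf("\n");	/* prints 00011000: slots 5..7, then 0..2, were reclaimed */
	return 0;
}
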
1760 :
1761 : /*
1762 : * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
1763 : * discretionary entries from the extent status cache. (Some entries
1764 : * must be present for proper operation.)
1765 : */
1766 0 : void ext4_clear_inode_es(struct inode *inode)
1767 : {
1768 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1769 0 : struct extent_status *es;
1770 0 : struct ext4_es_tree *tree;
1771 0 : struct rb_node *node;
1772 :
1773 0 : write_lock(&ei->i_es_lock);
1774 0 : tree = &EXT4_I(inode)->i_es_tree;
1775 0 : tree->cache_es = NULL;
1776 0 : node = rb_first(&tree->root);
1777 0 : while (node) {
1778 0 : es = rb_entry(node, struct extent_status, rb_node);
1779 0 : node = rb_next(node);
1780 0 : if (!ext4_es_is_delayed(es)) {
1781 0 : rb_erase(&es->rb_node, &tree->root);
1782 0 : ext4_es_free_extent(inode, es);
1783 : }
1784 : }
1785 0 : ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1786 0 : write_unlock(&ei->i_es_lock);
1787 0 : }
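
/*
 * ext4_clear_inode_es() backs the EXT4_IOC_CLEAR_ES_CACHE ioctl. A minimal
 * userspace sketch of invoking it; the ioctl number is copied from
 * fs/ext4/ext4.h (_IO('f', 40)) and is not part of the exported uapi
 * headers, so treat the definition below as an assumption.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define EXT4_IOC_CLEAR_ES_CACHE	_IO('f', 40)	/* assumed value */

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* drop all non-delayed entries from the file's extent status cache */
	if (ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE) < 0) {
		perror("EXT4_IOC_CLEAR_ES_CACHE");
		return 1;
	}
	close(fd);
	return 0;
}
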
1788 :
1789 : #ifdef ES_DEBUG__
1790 : static void ext4_print_pending_tree(struct inode *inode)
1791 : {
1792 : struct ext4_pending_tree *tree;
1793 : struct rb_node *node;
1794 : struct pending_reservation *pr;
1795 :
1796 : printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1797 : tree = &EXT4_I(inode)->i_pending_tree;
1798 : node = rb_first(&tree->root);
1799 : while (node) {
1800 : pr = rb_entry(node, struct pending_reservation, rb_node);
1801 : printk(KERN_DEBUG " %u", pr->lclu);
1802 : node = rb_next(node);
1803 : }
1804 : printk(KERN_DEBUG "\n");
1805 : }
1806 : #else
1807 : #define ext4_print_pending_tree(inode)
1808 : #endif
1809 :
1810 1 : int __init ext4_init_pending(void)
1811 : {
1812 1 : ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
1813 : sizeof(struct pending_reservation),
1814 : 0, (SLAB_RECLAIM_ACCOUNT), NULL);
1815 1 : if (ext4_pending_cachep == NULL)
1816 0 : return -ENOMEM;
1817 : return 0;
1818 : }
1819 :
1820 0 : void ext4_exit_pending(void)
1821 : {
1822 0 : kmem_cache_destroy(ext4_pending_cachep);
1823 0 : }
1824 :
1825 5559 : void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1826 : {
1827 5559 : tree->root = RB_ROOT;
1828 5559 : }
1829 :
1830 : /*
1831 : * __get_pending - retrieve a pointer to a pending reservation
1832 : *
1833 : * @inode - file containing the pending cluster reservation
1834 : * @lclu - logical cluster of interest
1835 : *
1836 : * Returns a pointer to a pending reservation if it's a member of
1837 : * the set, and NULL if not. Must be called holding i_es_lock.
1838 : */
1839 0 : static struct pending_reservation *__get_pending(struct inode *inode,
1840 : ext4_lblk_t lclu)
1841 : {
1842 0 : struct ext4_pending_tree *tree;
1843 0 : struct rb_node *node;
1844 0 : struct pending_reservation *pr = NULL;
1845 :
1846 0 : tree = &EXT4_I(inode)->i_pending_tree;
1847 0 : node = (&tree->root)->rb_node;
1848 :
1849 0 : while (node) {
1850 0 : pr = rb_entry(node, struct pending_reservation, rb_node);
1851 0 : if (lclu < pr->lclu)
1852 0 : node = node->rb_left;
1853 0 : else if (lclu > pr->lclu)
1854 0 : node = node->rb_right;
1855 0 : else if (lclu == pr->lclu)
1856 0 : return pr;
1857 : }
1858 : return NULL;
1859 : }
1860 :
1861 : /*
1862 : * __insert_pending - adds a pending cluster reservation to the set of
1863 : * pending reservations
1864 : *
1865 : * @inode - file containing the cluster
1866 : * @lblk - logical block in the cluster to be added
1867 : *
1868 : * Returns 0 on successful insertion and -ENOMEM on failure. If the
1869 : * pending reservation is already in the set, returns successfully.
1870 : */
1871 0 : static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
1872 : {
1873 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1874 0 : struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1875 0 : struct rb_node **p = &tree->root.rb_node;
1876 0 : struct rb_node *parent = NULL;
1877 0 : struct pending_reservation *pr;
1878 0 : ext4_lblk_t lclu;
1879 0 : int ret = 0;
1880 :
1881 0 : lclu = EXT4_B2C(sbi, lblk);
1882 : /* search to find parent for insertion */
1883 0 : while (*p) {
1884 0 : parent = *p;
1885 0 : pr = rb_entry(parent, struct pending_reservation, rb_node);
1886 :
1887 0 : if (lclu < pr->lclu) {
1888 0 : p = &(*p)->rb_left;
1889 0 : } else if (lclu > pr->lclu) {
1890 0 : p = &(*p)->rb_right;
1891 : } else {
1892 : /* pending reservation already inserted */
1893 0 : goto out;
1894 : }
1895 : }
1896 :
1897 0 : pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
1898 0 : if (pr == NULL) {
1899 0 : ret = -ENOMEM;
1900 0 : goto out;
1901 : }
1902 0 : pr->lclu = lclu;
1903 :
1904 0 : rb_link_node(&pr->rb_node, parent, p);
1905 0 : rb_insert_color(&pr->rb_node, &tree->root);
1906 :
1907 0 : out:
1908 0 : return ret;
1909 : }
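
/*
 * __insert_pending() follows the standard rbtree insertion protocol: walk
 * down recording the parent and the empty link slot, then rb_link_node()
 * and rb_insert_color(). The same descend-and-link shape with a plain
 * unbalanced BST (rebalancing omitted; all names hypothetical):
 */
#include <stdlib.h>

struct node {
	unsigned int key;
	struct node *left, *right;
};

static int bst_insert(struct node **root, unsigned int key)
{
	struct node **p = root, *n;

	while (*p) {			/* find the empty link slot */
		if (key < (*p)->key)
			p = &(*p)->left;
		else if (key > (*p)->key)
			p = &(*p)->right;
		else
			return 0;	/* already present: succeed, as above */
	}
	n = calloc(1, sizeof(*n));
	if (!n)
		return -1;		/* the -ENOMEM analogue */
	n->key = key;
	*p = n;				/* link in place; an rbtree recolors here */
	return 0;
}

int main(void)
{
	struct node *root = NULL;

	/* inserting a duplicate is a successful no-op */
	return bst_insert(&root, 7) || bst_insert(&root, 7);
}
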
1910 :
1911 : /*
1912 : * __remove_pending - removes a pending cluster reservation from the set
1913 : * of pending reservations
1914 : *
1915 : * @inode - file containing the cluster
1916 : * @lblk - logical block in the pending cluster reservation to be removed
1917 : *
1918 : * Does nothing if the pending reservation is not a member of the set.
1919 : */
1920 0 : static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1921 : {
1922 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1923 0 : struct pending_reservation *pr;
1924 0 : struct ext4_pending_tree *tree;
1925 :
1926 0 : pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1927 0 : if (pr != NULL) {
1928 0 : tree = &EXT4_I(inode)->i_pending_tree;
1929 0 : rb_erase(&pr->rb_node, &tree->root);
1930 0 : kmem_cache_free(ext4_pending_cachep, pr);
1931 : }
1932 0 : }
1933 :
1934 : /*
1935 : * ext4_remove_pending - removes a pending cluster reservation from the set
1936 : * of pending reservations
1937 : *
1938 : * @inode - file containing the cluster
1939 : * @lblk - logical block in the pending cluster reservation to be removed
1940 : *
1941 : * Locking for external use of __remove_pending.
1942 : */
1943 0 : void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
1944 : {
1945 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1946 :
1947 0 : write_lock(&ei->i_es_lock);
1948 0 : __remove_pending(inode, lblk);
1949 0 : write_unlock(&ei->i_es_lock);
1950 0 : }
1951 :
1952 : /*
1953 : * ext4_is_pending - determine whether a cluster has a pending reservation
1954 : * on it
1955 : *
1956 : * @inode - file containing the cluster
1957 : * @lblk - logical block in the cluster
1958 : *
1959 : * Returns true if there's a pending reservation for the cluster in the
1960 : * set of pending reservations, and false if not.
1961 : */
1962 0 : bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
1963 : {
1964 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1965 0 : struct ext4_inode_info *ei = EXT4_I(inode);
1966 0 : bool ret;
1967 :
1968 0 : read_lock(&ei->i_es_lock);
1969 0 : ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
1970 0 : read_unlock(&ei->i_es_lock);
1971 :
1972 0 : return ret;
1973 : }
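
/*
 * ext4_is_pending() is a thin predicate wrapper: take the lock shared, do
 * the lookup, drop the lock, and return only a boolean so that no pointer
 * into the tree escapes the critical section. A generic sketch of that
 * design with POSIX rwlocks (set_contains() and the array are hypothetical
 * stand-ins):
 */
#include <stdbool.h>
#include <pthread.h>

static pthread_rwlock_t set_lock = PTHREAD_RWLOCK_INITIALIZER;
static int set_members[4] = {2, 3, 5, 7};

static bool set_contains(int key)
{
	bool ret = false;

	pthread_rwlock_rdlock(&set_lock);	/* readers may run concurrently */
	for (int i = 0; i < 4; i++)
		if (set_members[i] == key)
			ret = true;
	pthread_rwlock_unlock(&set_lock);	/* no set pointer escapes */
	return ret;
}

int main(void)
{
	return set_contains(5) ? 0 : 1;
}
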
1974 :
1975 : /*
1976 : * ext4_es_insert_delayed_block - adds a delayed block to the extents status
1977 : * tree, adding a pending reservation where
1978 : * needed
1979 : *
1980 : * @inode - file containing the newly added block
1981 : * @lblk - logical block to be added
1982 : * @allocated - indicates whether a physical cluster has been allocated for
1983 : * the logical cluster that contains the block
1984 : *
1985 : * Returns 0 on success, negative error code on failure.
1986 : */
1987 1703 : int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
1988 : bool allocated)
1989 : {
1990 1703 : struct extent_status newes;
1991 1703 : int err = 0;
1992 :
1993 1703 : if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1994 : return 0;
1995 :
1996 1703 : es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
1997 : lblk, inode->i_ino);
1998 :
1999 1703 : newes.es_lblk = lblk;
2000 1703 : newes.es_len = 1;
2001 1703 : ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2002 1703 : trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
2003 :
2004 1703 : ext4_es_insert_extent_check(inode, &newes);
2005 :
2006 1703 : write_lock(&EXT4_I(inode)->i_es_lock);
2007 :
2008 1703 : err = __es_remove_extent(inode, lblk, lblk, NULL);
2009 1703 : if (err != 0)
2010 0 : goto error;
2011 1703 : retry:
2012 1703 : err = __es_insert_extent(inode, &newes);
2013 1703 : if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
2014 0 : 128, EXT4_I(inode)))
2015 0 : goto retry;
2016 1703 : if (err != 0)
2017 0 : goto error;
2018 :
2019 1703 : if (allocated)
2020 0 : __insert_pending(inode, lblk);
2021 :
2022 1703 : error:
2023 1703 : write_unlock(&EXT4_I(inode)->i_es_lock);
2024 :
2025 1703 : ext4_es_print_tree(inode);
2026 1703 : ext4_print_pending_tree(inode);
2027 :
2028 1703 : return err;
2029 : }
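
/*
 * The insert path above retries an -ENOMEM insertion after asking
 * __es_shrink() for 128 objects, looping as long as reclaim reports
 * progress. A generic sketch of that allocate-or-shrink retry loop
 * (alloc_obj() and shrink_cache() are hypothetical stand-ins):
 */
#include <stdlib.h>

static int shrink_cache(int nr)
{
	return nr;	/* stand-in: pretend we reclaimed nr objects */
}

static void *alloc_obj(void)
{
	return malloc(32);
}

static void *alloc_with_reclaim(void)
{
	void *p;

retry:
	p = alloc_obj();
	if (!p && shrink_cache(128) > 0)
		goto retry;	/* reclaim made progress, so try again */
	return p;		/* NULL only once reclaim frees nothing */
}

int main(void)
{
	void *p = alloc_with_reclaim();

	free(p);
	return p ? 0 : 1;
}
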
2030 :
2031 : /*
2032 : * __es_delayed_clu - count number of clusters containing blocks that
2033 : * are delayed only
2034 : *
2035 : * @inode - file containing block range
2036 : * @start - logical block defining start of range
2037 : * @end - logical block defining end of range
2038 : *
2039 : * Returns the number of clusters containing only delayed (not delayed
2040 : * and unwritten) blocks in the range specified by @start and @end. Any
2041 : * cluster or part of a cluster within the range and containing a delayed
2042 : * and not unwritten block within the range is counted as a whole cluster.
2043 : */
2044 173 : static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
2045 : ext4_lblk_t end)
2046 : {
2047 173 : struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
2048 173 : struct extent_status *es;
2049 173 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2050 173 : struct rb_node *node;
2051 173 : ext4_lblk_t first_lclu, last_lclu;
2052 173 : unsigned long long last_counted_lclu;
2053 173 : unsigned int n = 0;
2054 :
2055 : /* guaranteed to be unequal to any ext4_lblk_t value */
2056 173 : last_counted_lclu = ~0ULL;
2057 :
2058 173 : es = __es_tree_search(&tree->root, start);
2059 :
2060 346 : while (es && (es->es_lblk <= end)) {
2061 173 : if (ext4_es_is_delonly(es)) {
2062 0 : if (es->es_lblk <= start)
2063 0 : first_lclu = EXT4_B2C(sbi, start);
2064 : else
2065 0 : first_lclu = EXT4_B2C(sbi, es->es_lblk);
2066 :
2067 0 : if (ext4_es_end(es) >= end)
2068 0 : last_lclu = EXT4_B2C(sbi, end);
2069 : else
2070 0 : last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
2071 :
2072 0 : if (first_lclu == last_counted_lclu)
2073 0 : n += last_lclu - first_lclu;
2074 : else
2075 0 : n += last_lclu - first_lclu + 1;
2076 0 : last_counted_lclu = last_lclu;
2077 : }
2078 173 : node = rb_next(&es->rb_node);
2079 173 : if (!node)
2080 : break;
2081 173 : es = rb_entry(node, struct extent_status, rb_node);
2082 : }
2083 :
2084 173 : return n;
2085 : }
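
/*
 * The cluster accounting above is shift arithmetic: EXT4_B2C(sbi, blk) is
 * blk >> s_cluster_bits, and last_counted_lclu prevents double-counting a
 * cluster shared by two adjacent delayed-only extents. A worked,
 * self-contained sketch assuming a cluster ratio of 16 (cluster_bits == 4;
 * the extents are chosen for illustration):
 */
#include <stdio.h>

#define CLUSTER_BITS	4	/* assumed: 16 blocks per cluster */

static unsigned int b2c(unsigned int blk)
{
	return blk >> CLUSTER_BITS;	/* mirrors EXT4_B2C() */
}

int main(void)
{
	/* two delayed-only extents: blocks [10, 30] and [31, 70] */
	unsigned int ext[2][2] = { {10, 30}, {31, 70} };
	unsigned long long last_counted = ~0ULL;	/* "none counted yet" */
	unsigned int n = 0;

	for (int i = 0; i < 2; i++) {
		unsigned int first = b2c(ext[i][0]);
		unsigned int last = b2c(ext[i][1]);

		/* first cluster may already be counted by the previous extent */
		if (first == last_counted)
			n += last - first;
		else
			n += last - first + 1;
		last_counted = last;
	}
	/* prints 5: clusters 0..4, with shared cluster 1 counted only once */
	printf("%u delayed clusters\n", n);
	return 0;
}
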
2086 :
2087 : /*
2088 : * ext4_es_delayed_clu - count number of clusters containing blocks that
2089 : * are delayed only (delayed and not unwritten)
2090 : *
2091 : * @inode - file containing block range
2092 : * @lblk - logical block defining start of range
2093 : * @len - number of blocks in range
2094 : *
2095 : * Locking for external use of __es_delayed_clu().
2096 : */
2097 173 : unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
2098 : ext4_lblk_t len)
2099 : {
2100 173 : struct ext4_inode_info *ei = EXT4_I(inode);
2101 173 : ext4_lblk_t end;
2102 173 : unsigned int n;
2103 :
2104 173 : if (len == 0)
2105 : return 0;
2106 :
2107 173 : end = lblk + len - 1;
2108 173 : WARN_ON(end < lblk);
2109 :
2110 173 : read_lock(&ei->i_es_lock);
2111 :
2112 173 : n = __es_delayed_clu(inode, lblk, end);
2113 :
2114 173 : read_unlock(&ei->i_es_lock);
2115 :
2116 173 : return n;
2117 : }
2118 :
2119 : /*
2120 : * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2121 : * reservations for a specified block range depending
2122 : * upon the presence or absence of delayed blocks
2123 : * outside the range within clusters at the ends of the
2124 : * range
2125 : *
2126 : * @inode - file containing the range
2127 : * @lblk - logical block defining the start of range
2128 : * @len - length of range in blocks
2129 : *
2130 : * Used after a newly allocated extent is added to the extents status tree.
2131 : * Requires that the extents in the range have either written or unwritten
2132 : * status. Must be called while holding i_es_lock.
2133 : */
2134 0 : static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2135 : ext4_lblk_t len)
2136 : {
2137 0 : struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2138 0 : ext4_lblk_t end = lblk + len - 1;
2139 0 : ext4_lblk_t first, last;
2140 0 : bool f_del = false, l_del = false;
2141 :
2142 0 : if (len == 0)
2143 : return;
2144 :
2145 : /*
2146 : * Two cases - a block range within a single cluster, and a block
2147 : * range spanning two or more clusters. Note that a cluster belonging
2148 : * to a range starting and/or ending on a cluster boundary is treated
2149 : * as if it does not contain a delayed extent. The new range may
2150 : * have allocated space for previously delayed blocks out to the
2151 : * cluster boundary, requiring that any pre-existing pending
2152 : * reservation be canceled. Because this code only looks at blocks
2153 : * outside the range, it should revise pending reservations
2154 : * correctly even if the extent represented by the range can't be
2155 : * inserted in the extents status tree due to ENOSPC.
2156 : */
2157 :
2158 0 : if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2159 0 : first = EXT4_LBLK_CMASK(sbi, lblk);
2160 0 : if (first != lblk)
2161 0 : f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2162 : first, lblk - 1);
2163 0 : if (f_del) {
2164 0 : __insert_pending(inode, first);
2165 : } else {
2166 0 : last = EXT4_LBLK_CMASK(sbi, end) +
2167 : sbi->s_cluster_ratio - 1;
2168 0 : if (last != end)
2169 0 : l_del = __es_scan_range(inode,
2170 : &ext4_es_is_delonly,
2171 : end + 1, last);
2172 0 : if (l_del)
2173 0 : __insert_pending(inode, last);
2174 : else
2175 0 : __remove_pending(inode, last);
2176 : }
2177 : } else {
2178 0 : first = EXT4_LBLK_CMASK(sbi, lblk);
2179 0 : if (first != lblk)
2180 0 : f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2181 : first, lblk - 1);
2182 0 : if (f_del)
2183 0 : __insert_pending(inode, first);
2184 : else
2185 0 : __remove_pending(inode, first);
2186 :
2187 0 : last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2188 0 : if (last != end)
2189 0 : l_del = __es_scan_range(inode, &ext4_es_is_delonly,
2190 : end + 1, last);
2191 0 : if (l_del)
2192 0 : __insert_pending(inode, last);
2193 : else
2194 0 : __remove_pending(inode, last);
2195 : }
2196 : }
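
/*
 * A worked example of the boundary arithmetic used by __revise_pending(),
 * assuming a cluster ratio of 4, so EXT4_LBLK_CMASK() clears the low two
 * bits. For a newly mapped range [5, 6], the containing cluster spans
 * blocks 4..7, so only blocks [4, 4] and [7, 7] outside the range can
 * still hold delayed-only data and decide the pending reservation.
 */
#include <stdio.h>

#define CLUSTER_RATIO	4U	/* assumed cluster ratio */

/* mirrors EXT4_LBLK_CMASK(): round lblk down to its cluster boundary */
static unsigned int lblk_cmask(unsigned int lblk)
{
	return lblk & ~(CLUSTER_RATIO - 1);
}

int main(void)
{
	unsigned int lblk = 5, len = 2;
	unsigned int end = lblk + len - 1;
	unsigned int first = lblk_cmask(lblk);
	unsigned int last = lblk_cmask(end) + CLUSTER_RATIO - 1;

	/* single-cluster case: both ends map into the same cluster */
	printf("cluster of [%u, %u] spans blocks [%u, %u]\n",
	       lblk, end, first, last);
	printf("scan for delayed-only blocks in [%u, %u] and [%u, %u]\n",
	       first, lblk - 1, end + 1, last);	/* [4, 4] and [7, 7] */
	return 0;
}
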
|