Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * High-level sync()-related operations
4 : */
5 :
6 : #include <linux/kernel.h>
7 : #include <linux/file.h>
8 : #include <linux/fs.h>
9 : #include <linux/slab.h>
10 : #include <linux/export.h>
11 : #include <linux/namei.h>
12 : #include <linux/sched.h>
13 : #include <linux/writeback.h>
14 : #include <linux/syscalls.h>
15 : #include <linux/linkage.h>
16 : #include <linux/pagemap.h>
17 : #include <linux/quotaops.h>
18 : #include <linux/backing-dev.h>
19 : #include "internal.h"
20 :
/* Every flag bit sync_file_range() accepts; any other bit yields -EINVAL. */
#define VALID_FLAGS						\
	(SYNC_FILE_RANGE_WAIT_BEFORE |				\
	 SYNC_FILE_RANGE_WRITE |				\
	 SYNC_FILE_RANGE_WAIT_AFTER)
23 :
24 : /*
25 : * Do the filesystem syncing work. For simple filesystems
26 : * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
27 : * submit IO for these buffers via __sync_blockdev(). This also speeds up the
28 : * wait == 1 case since in that case write_inode() functions do
29 : * sync_dirty_buffer() and thus effectively write one block at a time.
30 : */
31 202 : static int __sync_filesystem(struct super_block *sb, int wait)
32 : {
33 202 : if (wait)
34 101 : sync_inodes_sb(sb);
35 : else
36 101 : writeback_inodes_sb(sb, WB_REASON_SYNC);
37 :
38 202 : if (sb->s_op->sync_fs)
39 4 : sb->s_op->sync_fs(sb, wait);
40 202 : return __sync_blockdev(sb->s_bdev, wait);
41 : }
42 :
43 : /*
44 : * Write out and wait upon all dirty data associated with this
45 : * superblock. Filesystem data as well as the underlying block
46 : * device. Takes the superblock lock.
47 : */
48 101 : int sync_filesystem(struct super_block *sb)
49 : {
50 101 : int ret;
51 :
52 : /*
53 : * We need to be protected against the filesystem going from
54 : * r/o to r/w or vice versa.
55 : */
56 101 : WARN_ON(!rwsem_is_locked(&sb->s_umount));
57 :
58 : /*
59 : * No point in syncing out anything if the filesystem is read-only.
60 : */
61 101 : if (sb_rdonly(sb))
62 : return 0;
63 :
64 101 : ret = __sync_filesystem(sb, 0);
65 101 : if (ret < 0)
66 : return ret;
67 101 : return __sync_filesystem(sb, 1);
68 : }
69 : EXPORT_SYMBOL(sync_filesystem);
70 :
/*
 * iterate_supers() callback: write and wait on the inodes of one writable
 * superblock.  @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (sb_rdonly(sb))
		return;

	sync_inodes_sb(sb);
}
76 :
77 0 : static void sync_fs_one_sb(struct super_block *sb, void *arg)
78 : {
79 0 : if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
80 0 : sb->s_op->sync_fs)
81 0 : sb->s_op->sync_fs(sb, *(int *)arg);
82 0 : }
83 :
84 0 : static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
85 : {
86 0 : filemap_fdatawrite(bdev->bd_inode->i_mapping);
87 0 : }
88 :
89 0 : static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
90 : {
91 : /*
92 : * We keep the error status of individual mapping so that
93 : * applications can catch the writeback error using fsync(2).
94 : * See filemap_fdatawait_keep_errors() for details.
95 : */
96 0 : filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
97 0 : }
98 :
99 : /*
100 : * Sync everything. We start by waking flusher threads so that most of
101 : * writeback runs on all devices in parallel. Then we sync all inodes reliably
102 : * which effectively also waits for all flusher threads to finish doing
103 : * writeback. At this point all data is on disk so metadata should be stable
104 : * and we tell filesystems to sync their metadata via ->sync_fs() calls.
105 : * Finally, we writeout all block devices because some filesystems (e.g. ext2)
106 : * just write metadata (such as inodes or bitmaps) to block device page cache
107 : * and do not sync it on their own in ->sync_fs().
108 : */
109 0 : void ksys_sync(void)
110 : {
111 0 : int nowait = 0, wait = 1;
112 :
113 0 : wakeup_flusher_threads(WB_REASON_SYNC);
114 0 : iterate_supers(sync_inodes_one_sb, NULL);
115 0 : iterate_supers(sync_fs_one_sb, &nowait);
116 0 : iterate_supers(sync_fs_one_sb, &wait);
117 0 : iterate_bdevs(fdatawrite_one_bdev, NULL);
118 0 : iterate_bdevs(fdatawait_one_bdev, NULL);
119 0 : if (unlikely(laptop_mode))
120 0 : laptop_sync_completion();
121 0 : }
122 :
/* sync(2) entry point: run ksys_sync() and always report success. */
SYSCALL_DEFINE0(sync)
{
	ksys_sync();

	return 0;
}
128 :
129 0 : static void do_sync_work(struct work_struct *work)
130 : {
131 0 : int nowait = 0;
132 :
133 : /*
134 : * Sync twice to reduce the possibility we skipped some inodes / pages
135 : * because they were temporarily locked
136 : */
137 0 : iterate_supers(sync_inodes_one_sb, &nowait);
138 0 : iterate_supers(sync_fs_one_sb, &nowait);
139 0 : iterate_bdevs(fdatawrite_one_bdev, NULL);
140 0 : iterate_supers(sync_inodes_one_sb, &nowait);
141 0 : iterate_supers(sync_fs_one_sb, &nowait);
142 0 : iterate_bdevs(fdatawrite_one_bdev, NULL);
143 0 : printk("Emergency Sync complete\n");
144 0 : kfree(work);
145 0 : }
146 :
147 0 : void emergency_sync(void)
148 : {
149 0 : struct work_struct *work;
150 :
151 0 : work = kmalloc(sizeof(*work), GFP_ATOMIC);
152 0 : if (work) {
153 0 : INIT_WORK(work, do_sync_work);
154 0 : schedule_work(work);
155 : }
156 0 : }
157 :
158 : /*
159 : * sync a single super
160 : */
161 0 : SYSCALL_DEFINE1(syncfs, int, fd)
162 : {
163 0 : struct fd f = fdget(fd);
164 0 : struct super_block *sb;
165 0 : int ret, ret2;
166 :
167 0 : if (!f.file)
168 : return -EBADF;
169 0 : sb = f.file->f_path.dentry->d_sb;
170 :
171 0 : down_read(&sb->s_umount);
172 0 : ret = sync_filesystem(sb);
173 0 : up_read(&sb->s_umount);
174 :
175 0 : ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
176 :
177 0 : fdput(f);
178 0 : return ret ? ret : ret2;
179 : }
180 :
181 : /**
182 : * vfs_fsync_range - helper to sync a range of data & metadata to disk
183 : * @file: file to sync
184 : * @start: offset in bytes of the beginning of data range to sync
185 : * @end: offset in bytes of the end of data range (inclusive)
186 : * @datasync: perform only datasync
187 : *
188 : * Write back data in range @start..@end and metadata for @file to disk. If
189 : * @datasync is set only metadata needed to access modified file data is
190 : * written.
191 : */
192 125 : int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
193 : {
194 125 : struct inode *inode = file->f_mapping->host;
195 :
196 125 : if (!file->f_op->fsync)
197 : return -EINVAL;
198 125 : if (!datasync && (inode->i_state & I_DIRTY_TIME))
199 0 : mark_inode_dirty_sync(inode);
200 125 : return file->f_op->fsync(file, start, end, datasync);
201 : }
202 : EXPORT_SYMBOL(vfs_fsync_range);
203 :
204 : /**
205 : * vfs_fsync - perform a fsync or fdatasync on a file
206 : * @file: file to sync
207 : * @datasync: only perform a fdatasync operation
208 : *
209 : * Write back data and metadata for @file to disk. If @datasync is
210 : * set only metadata needed to access modified file data is written.
211 : */
212 44 : int vfs_fsync(struct file *file, int datasync)
213 : {
214 0 : return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
215 : }
216 : EXPORT_SYMBOL(vfs_fsync);
217 :
218 44 : static int do_fsync(unsigned int fd, int datasync)
219 : {
220 44 : struct fd f = fdget(fd);
221 44 : int ret = -EBADF;
222 :
223 44 : if (f.file) {
224 44 : ret = vfs_fsync(f.file, datasync);
225 44 : fdput(f);
226 : }
227 44 : return ret;
228 : }
229 :
230 88 : SYSCALL_DEFINE1(fsync, unsigned int, fd)
231 : {
232 44 : return do_fsync(fd, 0);
233 : }
234 :
235 0 : SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
236 : {
237 0 : return do_fsync(fd, 1);
238 : }
239 :
240 0 : int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
241 : unsigned int flags)
242 : {
243 0 : int ret;
244 0 : struct address_space *mapping;
245 0 : loff_t endbyte; /* inclusive */
246 0 : umode_t i_mode;
247 :
248 0 : ret = -EINVAL;
249 0 : if (flags & ~VALID_FLAGS)
250 0 : goto out;
251 :
252 0 : endbyte = offset + nbytes;
253 :
254 0 : if ((s64)offset < 0)
255 0 : goto out;
256 0 : if ((s64)endbyte < 0)
257 0 : goto out;
258 0 : if (endbyte < offset)
259 0 : goto out;
260 :
261 0 : if (sizeof(pgoff_t) == 4) {
262 : if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
263 : /*
264 : * The range starts outside a 32 bit machine's
265 : * pagecache addressing capabilities. Let it "succeed"
266 : */
267 : ret = 0;
268 : goto out;
269 : }
270 : if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
271 : /*
272 : * Out to EOF
273 : */
274 : nbytes = 0;
275 : }
276 : }
277 :
278 0 : if (nbytes == 0)
279 : endbyte = LLONG_MAX;
280 : else
281 0 : endbyte--; /* inclusive */
282 :
283 0 : i_mode = file_inode(file)->i_mode;
284 0 : ret = -ESPIPE;
285 0 : if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
286 : !S_ISLNK(i_mode))
287 0 : goto out;
288 :
289 0 : mapping = file->f_mapping;
290 0 : ret = 0;
291 0 : if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
292 0 : ret = file_fdatawait_range(file, offset, endbyte);
293 0 : if (ret < 0)
294 0 : goto out;
295 : }
296 :
297 0 : if (flags & SYNC_FILE_RANGE_WRITE) {
298 0 : int sync_mode = WB_SYNC_NONE;
299 :
300 0 : if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
301 : SYNC_FILE_RANGE_WRITE_AND_WAIT)
302 0 : sync_mode = WB_SYNC_ALL;
303 :
304 0 : ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
305 : sync_mode);
306 0 : if (ret < 0)
307 0 : goto out;
308 : }
309 :
310 0 : if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
311 0 : ret = file_fdatawait_range(file, offset, endbyte);
312 :
313 0 : out:
314 0 : return ret;
315 : }
316 :
317 : /*
318 : * ksys_sync_file_range() permits finely controlled syncing over a segment of
319 : * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
320 : * zero then ksys_sync_file_range() will operate from offset out to EOF.
321 : *
322 : * The flag bits are:
323 : *
324 : * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
325 : * before performing the write.
326 : *
327 : * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
328 : * range which are not presently under writeback. Note that this may block for
329 : * significant periods due to exhaustion of disk request structures.
330 : *
331 : * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
332 : * after performing the write.
333 : *
334 : * Useful combinations of the flag bits are:
335 : *
336 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
337 : * in the range which were dirty on entry to ksys_sync_file_range() are placed
338 : * under writeout. This is a start-write-for-data-integrity operation.
339 : *
340 : * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
341 : * are not presently under writeout. This is an asynchronous flush-to-disk
342 : * operation. Not suitable for data integrity operations.
343 : *
344 : * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
345 : * completion of writeout of all pages in the range. This will be used after an
346 : * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
347 : * for that operation to complete and to return the result.
348 : *
349 : * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
350 : * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
351 : * a traditional sync() operation. This is a write-for-data-integrity operation
352 : * which will ensure that all pages in the range which were dirty on entry to
353 : * ksys_sync_file_range() are written to disk. It should be noted that disk
354 : * caches are not flushed by this call, so there are no guarantees here that the
355 : * data will be available on disk after a crash.
356 : *
357 : *
358 : * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
359 : * I/O errors or ENOSPC conditions and will return those to the caller, after
360 : * clearing the EIO and ENOSPC flags in the address_space.
361 : *
362 : * It should be noted that none of these operations write out the file's
363 : * metadata. So unless the application is strictly performing overwrites of
364 : * already-instantiated disk blocks, there are no guarantees here that the data
365 : * will be available after a crash.
366 : */
367 0 : int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
368 : unsigned int flags)
369 : {
370 0 : int ret;
371 0 : struct fd f;
372 :
373 0 : ret = -EBADF;
374 0 : f = fdget(fd);
375 0 : if (f.file)
376 0 : ret = sync_file_range(f.file, offset, nbytes, flags);
377 :
378 0 : fdput(f);
379 0 : return ret;
380 : }
381 :
382 0 : SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
383 : unsigned int, flags)
384 : {
385 0 : return ksys_sync_file_range(fd, offset, nbytes, flags);
386 : }
387 :
388 : /* It would be nice if people remember that not all the world's an i386
389 : when they introduce new system calls */
390 0 : SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
391 : loff_t, offset, loff_t, nbytes)
392 : {
393 0 : return ksys_sync_file_range(fd, offset, nbytes, flags);
394 : }
|