/*
 * High-level sync()-related operations
 */
5 #include <linux/kernel.h>
6 #include <linux/file.h>
8 #include <linux/slab.h>
9 #include <linux/export.h>
10 #include <linux/namei.h>
11 #include <linux/sched.h>
12 #include <linux/writeback.h>
13 #include <linux/syscalls.h>
14 #include <linux/linkage.h>
15 #include <linux/pagemap.h>
16 #include <linux/quotaops.h>
17 #include <linux/backing-dev.h>
20 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
21 SYNC_FILE_RANGE_WAIT_AFTER)
23 /* Interruptible sync for Samsung Mobile Device */
24 #ifdef CONFIG_INTERRUPTIBLE_SYNC
26 #include <linux/workqueue.h>
27 #include <linux/suspend.h>
28 #include <linux/delay.h>
30 //#define CONFIG_INTR_SYNC_DEBUG
32 #ifdef CONFIG_INTR_SYNC_DEBUG
33 #define dbg_print printk
35 #define dbg_print(...)
39 INTR_SYNC_STATE_IDLE
= 0,
40 INTR_SYNC_STATE_QUEUED
,
41 INTR_SYNC_STATE_RUNNING
,
45 struct interruptible_sync_work
{
50 unsigned long version
;
52 struct completion done
;
53 struct work_struct work
;
56 /* Initially, intr_sync_work has zero pending */
57 static struct interruptible_sync_work intr_sync_work
[2];
/* Index (id) of the intr_sync_work entry that most recently started running */
60 static atomic_t running_work_idx
;
/* intr_sync_wq is created the first time intr_sync() is called,
 * and it stays alive until system shutdown. */
64 static struct workqueue_struct
*intr_sync_wq
;
66 /* It prevents double allocation of intr_sync_wq */
67 static DEFINE_MUTEX(intr_sync_wq_lock
);
69 static inline struct interruptible_sync_work
*INTR_SYNC_WORK(struct work_struct
*work
)
71 return container_of(work
, struct interruptible_sync_work
, work
);
74 static void do_intr_sync(struct work_struct
*work
)
76 struct interruptible_sync_work
*sync_work
= INTR_SYNC_WORK(work
);
80 spin_lock(&sync_work
->lock
);
81 atomic_set(&running_work_idx
, sync_work
->id
);
82 sync_work
->state
= INTR_SYNC_STATE_RUNNING
;
83 waiter
= sync_work
->waiter
;
84 spin_unlock(&sync_work
->lock
);
86 dbg_print("\nintr_sync: %s: call sys_sync on work[%d]-%ld\n",
87 __func__
, sync_work
->id
, sync_work
->version
);
89 /* if no one waits, do not call sync() */
92 dbg_print("\nintr_sync: %s: done sys_sync on work[%d]-%ld\n",
93 __func__
, sync_work
->id
, sync_work
->version
);
95 dbg_print("\nintr_sync: %s: cancel,no_wait on work[%d]-%ld\n",
96 __func__
, sync_work
->id
, sync_work
->version
);
99 spin_lock(&sync_work
->lock
);
100 sync_work
->version
++;
101 sync_work
->ret
= ret
;
102 sync_work
->state
= INTR_SYNC_STATE_IDLE
;
103 complete_all(&sync_work
->done
);
104 spin_unlock(&sync_work
->lock
);
107 /* wakeup functions that depend on PM facilities
109 * struct intr_wakeup_data : wrapper structure for variables for PM
110 * each thread has own instance of it
111 * __prepare_wakeup_event() : prepare and check intr_wakeup_data
112 * __check_wakeup_event() : check wakeup-event with intr_wakeup_data
114 struct intr_wakeup_data
{
118 static inline int __prepare_wakeup_event(struct intr_wakeup_data
*wd
)
120 if (pm_get_wakeup_count(&wd
->cnt
, false))
123 pr_info("intr_sync: detected wakeup events before sync\n");
124 pm_print_active_wakeup_sources();
128 static inline int __check_wakeup_event(struct intr_wakeup_data
*wd
)
130 unsigned int cnt
, no_inpr
;
132 no_inpr
= pm_get_wakeup_count(&cnt
, false);
133 if (no_inpr
&& (cnt
== wd
->cnt
))
136 pr_info("intr_sync: detected wakeup events(no_inpr: %u cnt: %u->%u)\n",
137 no_inpr
, wd
->cnt
, cnt
);
138 pm_print_active_wakeup_sources();
/* Interruptible Sync
 *
 * intr_sync() does the same job as sys_sync(), except that the caller can
 * be woken up while waiting. This is possible because the sync itself runs
 * on the intr_syncd workqueue.
 *
 * If the system gets a wakeup event while sync_work is running,
 * intr_sync() just returns -EBUSY, otherwise 0.
 *
 * If intr_sync() is called again while sync_work is running, it enqueues
 * the idle sync_work to the workqueue and waits for its completion.
 * If there is no idle sync_work but a queued one, it just increases waiter
 * by 1 and waits for the completion of the queued sync_work.
 *
 * If you want to know the value returned by sys_sync(),
 * you can get it from the argument, sync_ret.
 */
159 int intr_sync(int *sync_ret
)
163 /* If the workqueue exists, try to enqueue work and wait */
164 if (likely(intr_sync_wq
)) {
165 struct interruptible_sync_work
*sync_work
;
166 struct intr_wakeup_data wd
;
171 work_idx
= !atomic_read(&running_work_idx
);
172 sync_work
= &intr_sync_work
[work_idx
];
174 /* Prepare intr_wakeup_data and check wakeup event:
175 * If a wakeup-event is detected, wake up right now
177 if (__prepare_wakeup_event(&wd
)) {
178 dbg_print("intr_sync: detect wakeup event "
179 "before waiting work[%d]\n", work_idx
);
183 dbg_print("\nintr_sync: try to wait work[%d]\n", work_idx
);
185 spin_lock(&sync_work
->lock
);
186 work_ver
= sync_work
->version
;
187 if (sync_work
->state
== INTR_SYNC_STATE_RUNNING
) {
188 spin_unlock(&sync_work
->lock
);
189 dbg_print("intr_sync: work[%d] is already running, "
190 "find idle work\n", work_idx
);
195 if (sync_work
->state
== INTR_SYNC_STATE_IDLE
) {
196 dbg_print("intr_sync: enqueue work[%d]\n", work_idx
);
197 sync_work
->state
= INTR_SYNC_STATE_QUEUED
;
198 INIT_COMPLETION(sync_work
->done
);
199 queue_work(intr_sync_wq
, &sync_work
->work
);
201 spin_unlock(&sync_work
->lock
);
204 /* Check wakeup event first before waiting:
205 * If a wakeup-event is detected, wake up right now
207 if (__check_wakeup_event(&wd
)) {
208 spin_lock(&sync_work
->lock
);
210 spin_unlock(&sync_work
->lock
);
211 dbg_print("intr_sync: detect wakeup event "
212 "while waiting work[%d]\n", work_idx
);
216 // dbg_print("intr_sync: waiting work[%d]\n", work_idx);
217 /* Return 0 if timed out, or positive if completed. */
218 ret
= wait_for_completion_io_timeout(
219 &sync_work
->done
, HZ
/10);
220 /* A work that we are waiting for has done. */
221 if ((ret
> 0) || (sync_work
->version
!= work_ver
))
223 // dbg_print("intr_sync: timeout work[%d]\n", work_idx);
226 spin_lock(&sync_work
->lock
);
229 *sync_ret
= sync_work
->ret
;
230 spin_unlock(&sync_work
->lock
);
231 dbg_print("intr_sync: sync work[%d] is done with ret(%d)\n",
232 work_idx
, sync_work
->ret
);
236 /* check whether a workqueue exists or not under locked state.
237 * Create new one if a workqueue is not created yet.
239 mutex_lock(&intr_sync_wq_lock
);
240 if (likely(!intr_sync_wq
)) {
241 intr_sync_work
[0].id
= 0;
242 intr_sync_work
[1].id
= 1;
243 INIT_WORK(&intr_sync_work
[0].work
, do_intr_sync
);
244 INIT_WORK(&intr_sync_work
[1].work
, do_intr_sync
);
245 spin_lock_init(&intr_sync_work
[0].lock
);
246 spin_lock_init(&intr_sync_work
[1].lock
);
247 init_completion(&intr_sync_work
[0].done
);
248 init_completion(&intr_sync_work
[1].done
);
249 intr_sync_wq
= alloc_ordered_workqueue("intr_syncd", WQ_MEM_RECLAIM
);
250 dbg_print("\nintr_sync: try to allocate intr_sync_queue\n");
252 mutex_unlock(&intr_sync_wq_lock
);
254 /* try to enqueue work again if the workqueue is created successfully */
255 if (likely(intr_sync_wq
))
256 goto enqueue_sync_wait
;
258 printk("\nintr_sync: allocation failed, just call sync()\n");
264 #else /* CONFIG_INTERRUPTIBLE_SYNC */
265 int intr_sync(int *sync_ret
)
267 int ret
= sys_sync();
272 #endif /* CONFIG_INTERRUPTIBLE_SYNC */
275 * Do the filesystem syncing work. For simple filesystems
276 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
277 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
278 * wait == 1 case since in that case write_inode() functions do
279 * sync_dirty_buffer() and thus effectively write one block at a time.
281 static int __sync_filesystem(struct super_block
*sb
, int wait
)
286 writeback_inodes_sb(sb
, WB_REASON_SYNC
);
288 if (sb
->s_op
->sync_fs
)
289 sb
->s_op
->sync_fs(sb
, wait
);
290 return __sync_blockdev(sb
->s_bdev
, wait
);
294 * Write out and wait upon all dirty data associated with this
295 * superblock. Filesystem data as well as the underlying block
296 * device. Takes the superblock lock.
298 int sync_filesystem(struct super_block
*sb
)
303 * We need to be protected against the filesystem going from
304 * r/o to r/w or vice versa.
306 WARN_ON(!rwsem_is_locked(&sb
->s_umount
));
309 * No point in syncing out anything if the filesystem is read-only.
311 if (sb
->s_flags
& MS_RDONLY
)
314 ret
= __sync_filesystem(sb
, 0);
317 return __sync_filesystem(sb
, 1);
319 EXPORT_SYMBOL_GPL(sync_filesystem
);
321 static void sync_inodes_one_sb(struct super_block
*sb
, void *arg
)
323 if (!(sb
->s_flags
& MS_RDONLY
))
327 static void sync_fs_one_sb(struct super_block
*sb
, void *arg
)
329 if (!(sb
->s_flags
& MS_RDONLY
) && sb
->s_op
->sync_fs
)
330 sb
->s_op
->sync_fs(sb
, *(int *)arg
);
333 static void fdatawrite_one_bdev(struct block_device
*bdev
, void *arg
)
335 filemap_fdatawrite(bdev
->bd_inode
->i_mapping
);
338 static void fdatawait_one_bdev(struct block_device
*bdev
, void *arg
)
340 filemap_fdatawait(bdev
->bd_inode
->i_mapping
);
344 * Sync everything. We start by waking flusher threads so that most of
345 * writeback runs on all devices in parallel. Then we sync all inodes reliably
346 * which effectively also waits for all flusher threads to finish doing
347 * writeback. At this point all data is on disk so metadata should be stable
348 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
349 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
350 * just write metadata (such as inodes or bitmaps) to block device page cache
351 * and do not sync it on their own in ->sync_fs().
353 SYSCALL_DEFINE0(sync
)
355 int nowait
= 0, wait
= 1;
357 wakeup_flusher_threads(0, WB_REASON_SYNC
);
358 iterate_supers(sync_inodes_one_sb
, NULL
);
359 iterate_supers(sync_fs_one_sb
, &nowait
);
360 iterate_supers(sync_fs_one_sb
, &wait
);
361 iterate_bdevs(fdatawrite_one_bdev
, NULL
);
362 iterate_bdevs(fdatawait_one_bdev
, NULL
);
363 if (unlikely(laptop_mode
))
364 laptop_sync_completion();
368 static void do_sync_work(struct work_struct
*work
)
373 * Sync twice to reduce the possibility we skipped some inodes / pages
374 * because they were temporarily locked
376 iterate_supers(sync_inodes_one_sb
, &nowait
);
377 iterate_supers(sync_fs_one_sb
, &nowait
);
378 iterate_bdevs(fdatawrite_one_bdev
, NULL
);
379 iterate_supers(sync_inodes_one_sb
, &nowait
);
380 iterate_supers(sync_fs_one_sb
, &nowait
);
381 iterate_bdevs(fdatawrite_one_bdev
, NULL
);
382 printk("Emergency Sync complete\n");
386 void emergency_sync(void)
388 struct work_struct
*work
;
390 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
392 INIT_WORK(work
, do_sync_work
);
398 * sync a single super
400 SYSCALL_DEFINE1(syncfs
, int, fd
)
402 struct fd f
= fdget(fd
);
403 struct super_block
*sb
;
408 sb
= f
.file
->f_dentry
->d_sb
;
410 down_read(&sb
->s_umount
);
411 ret
= sync_filesystem(sb
);
412 up_read(&sb
->s_umount
);
419 * vfs_fsync_range - helper to sync a range of data & metadata to disk
420 * @file: file to sync
421 * @start: offset in bytes of the beginning of data range to sync
422 * @end: offset in bytes of the end of data range (inclusive)
423 * @datasync: perform only datasync
425 * Write back data in range @start..@end and metadata for @file to disk. If
426 * @datasync is set only metadata needed to access modified file data is
429 int vfs_fsync_range(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
431 if (!file
->f_op
|| !file
->f_op
->fsync
)
433 return file
->f_op
->fsync(file
, start
, end
, datasync
);
435 EXPORT_SYMBOL(vfs_fsync_range
);
438 * vfs_fsync - perform a fsync or fdatasync on a file
439 * @file: file to sync
440 * @datasync: only perform a fdatasync operation
442 * Write back data and metadata for @file to disk. If @datasync is
443 * set only metadata needed to access modified file data is written.
445 int vfs_fsync(struct file
*file
, int datasync
)
447 return vfs_fsync_range(file
, 0, LLONG_MAX
, datasync
);
449 EXPORT_SYMBOL(vfs_fsync
);
451 static int do_fsync(unsigned int fd
, int datasync
)
453 struct fd f
= fdget(fd
);
457 ret
= vfs_fsync(f
.file
, datasync
);
464 SYSCALL_DEFINE1(fsync
, unsigned int, fd
)
466 return do_fsync(fd
, 0);
469 SYSCALL_DEFINE1(fdatasync
, unsigned int, fd
)
471 return do_fsync(fd
, 1);
475 * generic_write_sync - perform syncing after a write if file / inode is sync
476 * @file: file to which the write happened
477 * @pos: offset where the write started
478 * @count: length of the write
480 * This is just a simple wrapper about our general syncing function.
482 int generic_write_sync(struct file
*file
, loff_t pos
, loff_t count
)
484 if (!(file
->f_flags
& O_DSYNC
) && !IS_SYNC(file
->f_mapping
->host
))
486 return vfs_fsync_range(file
, pos
, pos
+ count
- 1,
487 (file
->f_flags
& __O_SYNC
) ? 0 : 1);
489 EXPORT_SYMBOL(generic_write_sync
);
492 * sys_sync_file_range() permits finely controlled syncing over a segment of
493 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
494 * zero then sys_sync_file_range() will operate from offset out to EOF.
498 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
499 * before performing the write.
501 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
502 * range which are not presently under writeback. Note that this may block for
503 * significant periods due to exhaustion of disk request structures.
505 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
506 * after performing the write.
508 * Useful combinations of the flag bits are:
510 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
511 * in the range which were dirty on entry to sys_sync_file_range() are placed
512 * under writeout. This is a start-write-for-data-integrity operation.
514 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
515 * are not presently under writeout. This is an asynchronous flush-to-disk
516 * operation. Not suitable for data integrity operations.
518 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
519 * completion of writeout of all pages in the range. This will be used after an
520 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
521 * for that operation to complete and to return the result.
523 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
524 * a traditional sync() operation. This is a write-for-data-integrity operation
525 * which will ensure that all pages in the range which were dirty on entry to
526 * sys_sync_file_range() are committed to disk.
529 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
530 * I/O errors or ENOSPC conditions and will return those to the caller, after
531 * clearing the EIO and ENOSPC flags in the address_space.
533 * It should be noted that none of these operations write out the file's
534 * metadata. So unless the application is strictly performing overwrites of
535 * already-instantiated disk blocks, there are no guarantees here that the data
536 * will be available after a crash.
538 SYSCALL_DEFINE4(sync_file_range
, int, fd
, loff_t
, offset
, loff_t
, nbytes
,
543 struct address_space
*mapping
;
544 loff_t endbyte
; /* inclusive */
548 if (flags
& ~VALID_FLAGS
)
551 endbyte
= offset
+ nbytes
;
555 if ((s64
)endbyte
< 0)
557 if (endbyte
< offset
)
560 if (sizeof(pgoff_t
) == 4) {
561 if (offset
>= (0x100000000ULL
<< PAGE_CACHE_SHIFT
)) {
563 * The range starts outside a 32 bit machine's
564 * pagecache addressing capabilities. Let it "succeed"
569 if (endbyte
>= (0x100000000ULL
<< PAGE_CACHE_SHIFT
)) {
580 endbyte
--; /* inclusive */
587 i_mode
= file_inode(f
.file
)->i_mode
;
589 if (!S_ISREG(i_mode
) && !S_ISBLK(i_mode
) && !S_ISDIR(i_mode
) &&
593 mapping
= f
.file
->f_mapping
;
600 if (flags
& SYNC_FILE_RANGE_WAIT_BEFORE
) {
601 ret
= filemap_fdatawait_range(mapping
, offset
, endbyte
);
606 if (flags
& SYNC_FILE_RANGE_WRITE
) {
607 ret
= filemap_fdatawrite_range(mapping
, offset
, endbyte
);
612 if (flags
& SYNC_FILE_RANGE_WAIT_AFTER
)
613 ret
= filemap_fdatawait_range(mapping
, offset
, endbyte
);
621 /* It would be nice if people remember that not all the world's an i386
622 when they introduce new system calls */
623 SYSCALL_DEFINE4(sync_file_range2
, int, fd
, unsigned int, flags
,
624 loff_t
, offset
, loff_t
, nbytes
)
626 return sys_sync_file_range(fd
, offset
, nbytes
, flags
);