fs/sync.c

   1 /*
   2  * High-level sync()-related operations
   3  */
   4
   5 #include <linux/kernel.h>
   6 #include <linux/file.h>
   7 #include <linux/fs.h>
   8 #include <linux/slab.h>
   9 #include <linux/export.h>
  10 #include <linux/namei.h>
  11 #include <linux/sched.h>
  12 #include <linux/writeback.h>
  13 #include <linux/syscalls.h>
  14 #include <linux/linkage.h>
  15 #include <linux/pagemap.h>
  16 #include <linux/quotaops.h>
  17 #include <linux/backing-dev.h>
  18 #include "internal.h"
  19
  20 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
  21                         SYNC_FILE_RANGE_WAIT_AFTER)
  22
  23 /* Interruptible sync for Samsung Mobile Device */
  24 #ifdef CONFIG_INTERRUPTIBLE_SYNC
  25
  26 #include <linux/workqueue.h>
  27 #include <linux/suspend.h>
  28 #include <linux/delay.h>
  29
  30 //#define CONFIG_INTR_SYNC_DEBUG
  31
  32 #ifdef CONFIG_INTR_SYNC_DEBUG
  33 #define dbg_print       printk
  34 #else
  35 #define dbg_print(...)
  36 #endif
  37
/* States stored in interruptible_sync_work.state */
enum {
        /* not queued; do_intr_sync() sets this again when it finishes */
        INTR_SYNC_STATE_IDLE = 0,
        /* intr_sync() has queued the work item on intr_sync_wq */
        INTR_SYNC_STATE_QUEUED,
        /* do_intr_sync() has started processing the work item */
        INTR_SYNC_STATE_RUNNING,
        INTR_SYNC_STATE_MAX
};
  44
/* One unit of interruptible sync work; intr_sync_work[] holds two of them */
struct interruptible_sync_work {
        /* index of this item in intr_sync_work[], assigned in intr_sync() */
        int id;
        /* sys_sync() result stored by do_intr_sync() (0 if sync was skipped) */
        int ret;
        /* number of intr_sync() callers currently waiting on this item */
        unsigned int waiter;
        /* one of INTR_SYNC_STATE_* */
        unsigned int state;
        /* incremented every time do_intr_sync() finishes with this item */
        unsigned long version;
        /* held while ret, waiter, state and version are updated */
        spinlock_t lock;
        /* signalled with complete_all() at the end of do_intr_sync() */
        struct completion done;
        /* the work item that gets queued on intr_sync_wq */
        struct work_struct work;
};
  55
/* Two work items in static (zero-filled) storage. Their id, work, lock and
 * completion members are set up in intr_sync() when the workqueue is created */
static struct interruptible_sync_work intr_sync_work[2];

/* id of the work item that most recently started running: written by
 * do_intr_sync(), read by intr_sync() to select the other work item */
static atomic_t running_work_idx;

/* intr_sync_wq will be created when intr_sync() is called for the first time.
 * And it stays alive till system shutdown */
static struct workqueue_struct *intr_sync_wq;

/* It prevents double allocation of intr_sync_wq */
static DEFINE_MUTEX(intr_sync_wq_lock);
  68
  69 static inline struct interruptible_sync_work *INTR_SYNC_WORK(struct work_struct *work)
  70 {
  71         return container_of(work, struct interruptible_sync_work, work);
  72 }
  73
  74 static void do_intr_sync(struct work_struct *work)
  75 {
  76         struct interruptible_sync_work *sync_work = INTR_SYNC_WORK(work);
  77         int ret = 0;
  78         unsigned int waiter;
  79
  80         spin_lock(&sync_work->lock);
  81         atomic_set(&running_work_idx, sync_work->id);
  82         sync_work->state = INTR_SYNC_STATE_RUNNING;
  83         waiter = sync_work->waiter;
  84         spin_unlock(&sync_work->lock);
  85
  86         dbg_print("\nintr_sync: %s: call sys_sync on work[%d]-%ld\n",
  87                         __func__, sync_work->id, sync_work->version);
  88
  89         /* if no one waits, do not call sync() */
  90         if (waiter) {
  91                 ret = sys_sync();
  92                 dbg_print("\nintr_sync: %s: done sys_sync on work[%d]-%ld\n",
  93                         __func__, sync_work->id, sync_work->version);
  94         } else {
  95                 dbg_print("\nintr_sync: %s: cancel,no_wait on work[%d]-%ld\n",
  96                         __func__, sync_work->id, sync_work->version);
  97         }
  98
  99         spin_lock(&sync_work->lock);
 100         sync_work->version++;
 101         sync_work->ret = ret;
 102         sync_work->state = INTR_SYNC_STATE_IDLE;
 103         complete_all(&sync_work->done);
 104         spin_unlock(&sync_work->lock);
 105 }
 106
/* Wakeup helpers that depend on PM facilities
 *
 * struct intr_wakeup_data  : wrapper structure for the variables used with PM;
 *                            each caller keeps its own instance of it
 * __prepare_wakeup_event() : prepare and check intr_wakeup_data
 * __check_wakeup_event()   : check for wakeup events against intr_wakeup_data
 */
struct intr_wakeup_data {
        /* wakeup count stored by pm_get_wakeup_count() in
         * __prepare_wakeup_event(); __check_wakeup_event() compares the
         * current count against it */
        unsigned int cnt;
};
 117
 118 static inline int __prepare_wakeup_event(struct intr_wakeup_data *wd)
 119 {
 120         if (pm_get_wakeup_count(&wd->cnt, false))
 121                 return 0;
 122
 123         pr_info("intr_sync: detected wakeup events before sync\n");
 124         pm_print_active_wakeup_sources();
 125         return -EBUSY;
 126 }
 127
 128 static inline  int __check_wakeup_event(struct intr_wakeup_data *wd)
 129 {
 130         unsigned int cnt, no_inpr;
 131
 132         no_inpr = pm_get_wakeup_count(&cnt, false);
 133         if (no_inpr && (cnt == wd->cnt))
 134                 return 0;
 135
 136         pr_info("intr_sync: detected wakeup events(no_inpr: %u cnt: %u->%u)\n",
 137                 no_inpr, wd->cnt, cnt);
 138         pm_print_active_wakeup_sources();
 139         return -EBUSY;
 140 }
 141
/* Interruptible Sync
 *
 * intr_sync() does the same job as sys_sync() except that the caller can be
 * woken up early. That is possible because the sync itself runs on the
 * intr_syncd workqueue.
 *
 * If the system gets a wakeup event while sync_work is running,
 * intr_sync() just returns -EBUSY; otherwise it returns 0.
 *
 * If intr_sync() is called again while a sync_work is running, it enqueues
 * the idle sync_work on the workqueue and waits for its completion.
 * If there is no idle sync_work but a queued one, it just increases waiter
 * by 1 and waits for the completion of the queued sync_work.
 *
 * If you want to know the value returned by sys_sync(),
 * you can get it through the argument, sync_ret.
 */
 158
 159 int intr_sync(int *sync_ret)
 160 {
 161         int ret;
 162 enqueue_sync_wait:
 163         /* If the workqueue exists, try to enqueue work and wait */
 164         if (likely(intr_sync_wq)) {
 165                 struct interruptible_sync_work *sync_work;
 166                 struct intr_wakeup_data wd;
 167                 int work_idx;
 168                 int work_ver;
 169
 170 find_idle:
 171                 work_idx = !atomic_read(&running_work_idx);
 172                 sync_work = &intr_sync_work[work_idx];
 173
 174                 /* Prepare intr_wakeup_data and check wakeup event:
 175                  * If a wakeup-event is detected, wake up right now
 176                  */
 177                 if (__prepare_wakeup_event(&wd)) {
 178                         dbg_print("intr_sync: detect wakeup event "
 179                                 "before waiting work[%d]\n", work_idx);
 180                         return -EBUSY;
 181                 }
 182
 183                 dbg_print("\nintr_sync: try to wait work[%d]\n", work_idx);
 184
 185                 spin_lock(&sync_work->lock);
 186                 work_ver = sync_work->version;
 187                 if (sync_work->state == INTR_SYNC_STATE_RUNNING) {
 188                         spin_unlock(&sync_work->lock);
 189                         dbg_print("intr_sync: work[%d] is already running, "
 190                                 "find idle work\n", work_idx);
 191                         goto find_idle;
 192                 }
 193
 194                 sync_work->waiter++;
 195                 if (sync_work->state == INTR_SYNC_STATE_IDLE) {
 196                         dbg_print("intr_sync: enqueue work[%d]\n", work_idx);
 197                         sync_work->state = INTR_SYNC_STATE_QUEUED;
 198                         INIT_COMPLETION(sync_work->done);
 199                         queue_work(intr_sync_wq, &sync_work->work);
 200                 }
 201                 spin_unlock(&sync_work->lock);
 202
 203                 do {
 204                         /* Check wakeup event first before waiting:
 205                          * If a wakeup-event is detected, wake up right now
 206                          */
 207                         if  (__check_wakeup_event(&wd)) {
 208                                 spin_lock(&sync_work->lock);
 209                                 sync_work->waiter--;
 210                                 spin_unlock(&sync_work->lock);
 211                                 dbg_print("intr_sync: detect wakeup event "
 212                                         "while waiting work[%d]\n", work_idx);
 213                                 return -EBUSY;
 214                         }
 215
 216 //                      dbg_print("intr_sync: waiting work[%d]\n", work_idx);
 217                         /* Return 0 if timed out, or positive if completed. */
 218                         ret = wait_for_completion_io_timeout(
 219                                         &sync_work->done, HZ/10);
 220                         /* A work that we are waiting for has done. */
 221                         if ((ret > 0) || (sync_work->version != work_ver))
 222                                 break;
 223 //                      dbg_print("intr_sync: timeout work[%d]\n", work_idx);
 224                 } while (1);
 225
 226                 spin_lock(&sync_work->lock);
 227                 sync_work->waiter--;
 228                 if (sync_ret)
 229                         *sync_ret = sync_work->ret;
 230                 spin_unlock(&sync_work->lock);
 231                 dbg_print("intr_sync: sync work[%d] is done with ret(%d)\n",
 232                                 work_idx, sync_work->ret);
 233                 return 0;
 234         }
 235
 236         /* check whether a workqueue exists or not under locked state.
 237          * Create new one if a workqueue is not created yet.
 238          */
 239         mutex_lock(&intr_sync_wq_lock);
 240         if (likely(!intr_sync_wq)) {
 241                 intr_sync_work[0].id = 0;
 242                 intr_sync_work[1].id = 1;
 243                 INIT_WORK(&intr_sync_work[0].work, do_intr_sync);
 244                 INIT_WORK(&intr_sync_work[1].work, do_intr_sync);
 245                 spin_lock_init(&intr_sync_work[0].lock);
 246                 spin_lock_init(&intr_sync_work[1].lock);
 247                 init_completion(&intr_sync_work[0].done);
 248                 init_completion(&intr_sync_work[1].done);
 249                 intr_sync_wq = alloc_ordered_workqueue("intr_syncd", WQ_MEM_RECLAIM);
 250                 dbg_print("\nintr_sync: try to allocate intr_sync_queue\n");
 251         }
 252         mutex_unlock(&intr_sync_wq_lock);
 253
 254         /* try to enqueue work again if the workqueue is created successfully */
 255         if (likely(intr_sync_wq))
 256                 goto enqueue_sync_wait;
 257
 258         printk("\nintr_sync: allocation failed, just call sync()\n");
 259         ret = sys_sync();
 260         if (sync_ret)
 261                 *sync_ret = ret;
 262         return 0;
 263 }
 264 #else /* CONFIG_INTERRUPTIBLE_SYNC */
 265 int intr_sync(int *sync_ret)
 266 {
 267         int ret = sys_sync();
 268         if (sync_ret)
 269                 *sync_ret = ret;
 270         return 0;
 271 }
 272 #endif /* CONFIG_INTERRUPTIBLE_SYNC */
 273
 274 /*
 275  * Do the filesystem syncing work. For simple filesystems
 276  * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
 277  * submit IO for these buffers via __sync_blockdev(). This also speeds up the
 278  * wait == 1 case since in that case write_inode() functions do
 279  * sync_dirty_buffer() and thus effectively write one block at a time.
 280  */
 281 static int __sync_filesystem(struct super_block *sb, int wait)
 282 {
 283         if (wait)
 284                 sync_inodes_sb(sb);
 285         else
 286                 writeback_inodes_sb(sb, WB_REASON_SYNC);
 287
 288         if (sb->s_op->sync_fs)
 289                 sb->s_op->sync_fs(sb, wait);
 290         return __sync_blockdev(sb->s_bdev, wait);
 291 }
 292
 293 /*
 294  * Write out and wait upon all dirty data associated with this
 295  * superblock.  Filesystem data as well as the underlying block
 296  * device.  Takes the superblock lock.
 297  */
 298 int sync_filesystem(struct super_block *sb)
 299 {
 300         int ret;
 301
 302         /*
 303          * We need to be protected against the filesystem going from
 304          * r/o to r/w or vice versa.
 305          */
 306         WARN_ON(!rwsem_is_locked(&sb->s_umount));
 307
 308         /*
 309          * No point in syncing out anything if the filesystem is read-only.
 310          */
 311         if (sb->s_flags & MS_RDONLY)
 312                 return 0;
 313
 314         ret = __sync_filesystem(sb, 0);
 315         if (ret < 0)
 316                 return ret;
 317         return __sync_filesystem(sb, 1);
 318 }
 319 EXPORT_SYMBOL_GPL(sync_filesystem);
 320
 321 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 322 {
 323         if (!(sb->s_flags & MS_RDONLY))
 324                 sync_inodes_sb(sb);
 325 }
 326
 327 static void sync_fs_one_sb(struct super_block *sb, void *arg)
 328 {
 329         if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
 330                 sb->s_op->sync_fs(sb, *(int *)arg);
 331 }
 332
 333 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
 334 {
 335         filemap_fdatawrite(bdev->bd_inode->i_mapping);
 336 }
 337
 338 static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 339 {
 340         filemap_fdatawait(bdev->bd_inode->i_mapping);
 341 }
 342
 343 /*
 344  * Sync everything. We start by waking flusher threads so that most of
 345  * writeback runs on all devices in parallel. Then we sync all inodes reliably
 346  * which effectively also waits for all flusher threads to finish doing
 347  * writeback. At this point all data is on disk so metadata should be stable
 348  * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 349  * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 350  * just write metadata (such as inodes or bitmaps) to block device page cache
 351  * and do not sync it on their own in ->sync_fs().
 352  */
 353 SYSCALL_DEFINE0(sync)
 354 {
 355         int nowait = 0, wait = 1;
 356
 357         wakeup_flusher_threads(0, WB_REASON_SYNC);
 358         iterate_supers(sync_inodes_one_sb, NULL);
 359         iterate_supers(sync_fs_one_sb, &nowait);
 360         iterate_supers(sync_fs_one_sb, &wait);
 361         iterate_bdevs(fdatawrite_one_bdev, NULL);
 362         iterate_bdevs(fdatawait_one_bdev, NULL);
 363         if (unlikely(laptop_mode))
 364                 laptop_sync_completion();
 365         return 0;
 366 }
 367
 368 static void do_sync_work(struct work_struct *work)
 369 {
 370         int nowait = 0;
 371
 372         /*
 373          * Sync twice to reduce the possibility we skipped some inodes / pages
 374          * because they were temporarily locked
 375          */
 376         iterate_supers(sync_inodes_one_sb, &nowait);
 377         iterate_supers(sync_fs_one_sb, &nowait);
 378         iterate_bdevs(fdatawrite_one_bdev, NULL);
 379         iterate_supers(sync_inodes_one_sb, &nowait);
 380         iterate_supers(sync_fs_one_sb, &nowait);
 381         iterate_bdevs(fdatawrite_one_bdev, NULL);
 382         printk("Emergency Sync complete\n");
 383         kfree(work);
 384 }
 385
 386 void emergency_sync(void)
 387 {
 388         struct work_struct *work;
 389
 390         work = kmalloc(sizeof(*work), GFP_ATOMIC);
 391         if (work) {
 392                 INIT_WORK(work, do_sync_work);
 393                 schedule_work(work);
 394         }
 395 }
 396
 397 /*
 398  * sync a single super
 399  */
 400 SYSCALL_DEFINE1(syncfs, int, fd)
 401 {
 402         struct fd f = fdget(fd);
 403         struct super_block *sb;
 404         int ret;
 405
 406         if (!f.file)
 407                 return -EBADF;
 408         sb = f.file->f_dentry->d_sb;
 409
 410         down_read(&sb->s_umount);
 411         ret = sync_filesystem(sb);
 412         up_read(&sb->s_umount);
 413
 414         fdput(f);
 415         return ret;
 416 }
 417
 418 /**
 419  * vfs_fsync_range - helper to sync a range of data & metadata to disk
 420  * @file:               file to sync
 421  * @start:              offset in bytes of the beginning of data range to sync
 422  * @end:                offset in bytes of the end of data range (inclusive)
 423  * @datasync:           perform only datasync
 424  *
 425  * Write back data in range @start..@end and metadata for @file to disk.  If
 426  * @datasync is set only metadata needed to access modified file data is
 427  * written.
 428  */
 429 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 430 {
 431         if (!file->f_op || !file->f_op->fsync)
 432                 return -EINVAL;
 433         return file->f_op->fsync(file, start, end, datasync);
 434 }
 435 EXPORT_SYMBOL(vfs_fsync_range);
 436
 437 /**
 438  * vfs_fsync - perform a fsync or fdatasync on a file
 439  * @file:               file to sync
 440  * @datasync:           only perform a fdatasync operation
 441  *
 442  * Write back data and metadata for @file to disk.  If @datasync is
 443  * set only metadata needed to access modified file data is written.
 444  */
 445 int vfs_fsync(struct file *file, int datasync)
 446 {
 447         return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
 448 }
 449 EXPORT_SYMBOL(vfs_fsync);
 450
 451 static int do_fsync(unsigned int fd, int datasync)
 452 {
 453         struct fd f = fdget(fd);
 454         int ret = -EBADF;
 455
 456         if (f.file) {
 457                 ret = vfs_fsync(f.file, datasync);
 458                 fdput(f);
 459                 inc_syscfs(current);
 460         }
 461         return ret;
 462 }
 463
 464 SYSCALL_DEFINE1(fsync, unsigned int, fd)
 465 {
 466         return do_fsync(fd, 0);
 467 }
 468
 469 SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 470 {
 471         return do_fsync(fd, 1);
 472 }
 473
 474 /**
 475  * generic_write_sync - perform syncing after a write if file / inode is sync
 476  * @file:       file to which the write happened
 477  * @pos:        offset where the write started
 478  * @count:      length of the write
 479  *
 480  * This is just a simple wrapper about our general syncing function.
 481  */
 482 int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 483 {
 484         if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 485                 return 0;
 486         return vfs_fsync_range(file, pos, pos + count - 1,
 487                                (file->f_flags & __O_SYNC) ? 0 : 1);
 488 }
 489 EXPORT_SYMBOL(generic_write_sync);
 490
 491 /*
 492  * sys_sync_file_range() permits finely controlled syncing over a segment of
 493  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 494  * zero then sys_sync_file_range() will operate from offset out to EOF.
 495  *
 496  * The flag bits are:
 497  *
 498  * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 499  * before performing the write.
 500  *
 501  * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 502  * range which are not presently under writeback. Note that this may block for
 503  * significant periods due to exhaustion of disk request structures.
 504  *
 505  * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 506  * after performing the write.
 507  *
 508  * Useful combinations of the flag bits are:
 509  *
 510  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 511  * in the range which were dirty on entry to sys_sync_file_range() are placed
 512  * under writeout.  This is a start-write-for-data-integrity operation.
 513  *
 514  * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 515  * are not presently under writeout.  This is an asynchronous flush-to-disk
 516  * operation.  Not suitable for data integrity operations.
 517  *
 518  * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 519  * completion of writeout of all pages in the range.  This will be used after an
 520  * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 521  * for that operation to complete and to return the result.
 522  *
 523  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
 524  * a traditional sync() operation.  This is a write-for-data-integrity operation
 525  * which will ensure that all pages in the range which were dirty on entry to
 526  * sys_sync_file_range() are committed to disk.
 527  *
 528  *
 529  * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 530  * I/O errors or ENOSPC conditions and will return those to the caller, after
 531  * clearing the EIO and ENOSPC flags in the address_space.
 532  *
 533  * It should be noted that none of these operations write out the file's
 534  * metadata.  So unless the application is strictly performing overwrites of
 535  * already-instantiated disk blocks, there are no guarantees here that the data
 536  * will be available after a crash.
 537  */
 538 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 539                                 unsigned int, flags)
 540 {
 541         int ret;
 542         struct fd f;
 543         struct address_space *mapping;
 544         loff_t endbyte;                 /* inclusive */
 545         umode_t i_mode;
 546
 547         ret = -EINVAL;
 548         if (flags & ~VALID_FLAGS)
 549                 goto out;
 550
 551         endbyte = offset + nbytes;
 552
 553         if ((s64)offset < 0)
 554                 goto out;
 555         if ((s64)endbyte < 0)
 556                 goto out;
 557         if (endbyte < offset)
 558                 goto out;
 559
 560         if (sizeof(pgoff_t) == 4) {
 561                 if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
 562                         /*
 563                          * The range starts outside a 32 bit machine's
 564                          * pagecache addressing capabilities.  Let it "succeed"
 565                          */
 566                         ret = 0;
 567                         goto out;
 568                 }
 569                 if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
 570                         /*
 571                          * Out to EOF
 572                          */
 573                         nbytes = 0;
 574                 }
 575         }
 576
 577         if (nbytes == 0)
 578                 endbyte = LLONG_MAX;
 579         else
 580                 endbyte--;              /* inclusive */
 581
 582         ret = -EBADF;
 583         f = fdget(fd);
 584         if (!f.file)
 585                 goto out;
 586
 587         i_mode = file_inode(f.file)->i_mode;
 588         ret = -ESPIPE;
 589         if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 590                         !S_ISLNK(i_mode))
 591                 goto out_put;
 592
 593         mapping = f.file->f_mapping;
 594         if (!mapping) {
 595                 ret = -EINVAL;
 596                 goto out_put;
 597         }
 598
 599         ret = 0;
 600         if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 601                 ret = filemap_fdatawait_range(mapping, offset, endbyte);
 602                 if (ret < 0)
 603                         goto out_put;
 604         }
 605
 606         if (flags & SYNC_FILE_RANGE_WRITE) {
 607                 ret = filemap_fdatawrite_range(mapping, offset, endbyte);
 608                 if (ret < 0)
 609                         goto out_put;
 610         }
 611
 612         if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
 613                 ret = filemap_fdatawait_range(mapping, offset, endbyte);
 614
 615 out_put:
 616         fdput(f);
 617 out:
 618         return ret;
 619 }
 620
 621 /* It would be nice if people remember that not all the world's an i386
 622    when they introduce new system calls */
 623 SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
 624                                  loff_t, offset, loff_t, nbytes)
 625 {
 626         return sys_sync_file_range(fd, offset, nbytes, flags);
 627 }