/*
 * High-level sync()-related operations
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
#include "internal.h"

#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
			SYNC_FILE_RANGE_WAIT_AFTER)

/* Interruptible sync for Samsung Mobile Device */
#ifdef CONFIG_INTERRUPTIBLE_SYNC

#include <linux/workqueue.h>
#include <linux/suspend.h>
#include <linux/delay.h>

//#define CONFIG_INTR_SYNC_DEBUG

#ifdef CONFIG_INTR_SYNC_DEBUG
#define dbg_print	printk
#else
#define dbg_print(...)
#endif

enum {
	INTR_SYNC_STATE_IDLE = 0,
	INTR_SYNC_STATE_QUEUED,
	INTR_SYNC_STATE_RUNNING,
	INTR_SYNC_STATE_MAX
};
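
/*
 * Editorial note (not in the original source): as implemented by
 * intr_sync() and do_intr_sync() below, each work item cycles through
 * these states:
 *
 *	IDLE --(intr_sync() queues the work)-----------> QUEUED
 *	QUEUED --(do_intr_sync() starts running)-------> RUNNING
 *	RUNNING --(sync finished, complete_all())------> IDLE
 *
 * The version counter is bumped on every RUNNING -> IDLE transition;
 * INTR_SYNC_STATE_MAX is only a bound, never an actual state.
 */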

struct interruptible_sync_work {
	int id;
	int ret;
	unsigned int waiter;
	unsigned int state;
	unsigned long version;
	spinlock_t lock;
	struct completion done;
	struct work_struct work;
};

/* Zero-initialized static storage: both works start IDLE with no waiters */
static struct interruptible_sync_work intr_sync_work[2];

/* Index of the work item that most recently started running */
static atomic_t running_work_idx;

/* intr_sync_wq is created the first time intr_sync() is called,
 * and stays alive until system shutdown */
static struct workqueue_struct *intr_sync_wq;

/* Protects against double allocation of intr_sync_wq */
static DEFINE_MUTEX(intr_sync_wq_lock);

static inline struct interruptible_sync_work *INTR_SYNC_WORK(struct work_struct *work)
{
	return container_of(work, struct interruptible_sync_work, work);
}

static void do_intr_sync(struct work_struct *work)
{
	struct interruptible_sync_work *sync_work = INTR_SYNC_WORK(work);
	int ret = 0;
	unsigned int waiter;

	spin_lock(&sync_work->lock);
	atomic_set(&running_work_idx, sync_work->id);
	sync_work->state = INTR_SYNC_STATE_RUNNING;
	waiter = sync_work->waiter;
	spin_unlock(&sync_work->lock);

	dbg_print("\nintr_sync: %s: call sys_sync on work[%d]-%ld\n",
			__func__, sync_work->id, sync_work->version);

	/* if no one waits, do not call sync() */
	if (waiter) {
		ret = sys_sync();
		dbg_print("\nintr_sync: %s: done sys_sync on work[%d]-%ld\n",
			__func__, sync_work->id, sync_work->version);
	} else {
		dbg_print("\nintr_sync: %s: cancel,no_wait on work[%d]-%ld\n",
			__func__, sync_work->id, sync_work->version);
	}

	spin_lock(&sync_work->lock);
	sync_work->version++;
	sync_work->ret = ret;
	sync_work->state = INTR_SYNC_STATE_IDLE;
	complete_all(&sync_work->done);
	spin_unlock(&sync_work->lock);
}

/* wakeup functions that depend on PM facilities
 *
 * struct intr_wakeup_data : wrapper structure for PM-related state;
 *                           each thread has its own instance
 * __prepare_wakeup_event() : prepare intr_wakeup_data and check for
 *                            pending wakeup events
 * __check_wakeup_event()   : check for wakeup events against the saved
 *                            intr_wakeup_data
 */
struct intr_wakeup_data {
	unsigned int cnt;
};

static inline int __prepare_wakeup_event(struct intr_wakeup_data *wd)
{
	if (pm_get_wakeup_count(&wd->cnt, false))
		return 0;

	pr_info("intr_sync: detected wakeup events before sync\n");
	pm_print_active_wakeup_sources();
	return -EBUSY;
}

static inline int __check_wakeup_event(struct intr_wakeup_data *wd)
{
	unsigned int cnt, no_inpr;

	no_inpr = pm_get_wakeup_count(&cnt, false);
	if (no_inpr && (cnt == wd->cnt))
		return 0;

	pr_info("intr_sync: detected wakeup events(no_inpr: %u cnt: %u->%u)\n",
		no_inpr, wd->cnt, cnt);
	pm_print_active_wakeup_sources();
	return -EBUSY;
}
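
/*
 * Editorial note, based on the PM wakeup API: pm_get_wakeup_count(&cnt,
 * false) is expected to store the current wakeup-event count in cnt and
 * return false while wakeup events are still being processed. The two
 * helpers above therefore treat either "events in progress" or "count
 * changed since __prepare_wakeup_event()" as a wakeup and return -EBUSY.
 */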

/* Interruptible Sync
 *
 * intr_sync() does the same job as sys_sync() except that it can be
 * interrupted by a wakeup event. This is possible because the actual
 * sync runs on the intr_syncd workqueue, so the caller is free to bail out.
 *
 * If the system gets a wakeup event while sync_work is running,
 * intr_sync() returns -EBUSY; otherwise it returns 0.
 *
 * If intr_sync() is called again while sync_work is running, it enqueues
 * the idle sync_work on the workqueue and waits for its completion.
 * If there is no idle sync_work but a queued one, it just increments the
 * waiter count and waits for the queued sync_work to complete.
 *
 * The return value of the underlying sys_sync() call is available
 * through the sync_ret argument.
 */

int intr_sync(int *sync_ret)
{
	int ret;
enqueue_sync_wait:
	/* If the workqueue exists, try to enqueue work and wait */
	if (likely(intr_sync_wq)) {
		struct interruptible_sync_work *sync_work;
		struct intr_wakeup_data wd;
		int work_idx;
		int work_ver;
find_idle:
		work_idx = !atomic_read(&running_work_idx);
		sync_work = &intr_sync_work[work_idx];

		/* Prepare intr_wakeup_data and check for wakeup events:
		 * if one is already pending, bail out right away
		 */
		if (__prepare_wakeup_event(&wd)) {
			dbg_print("intr_sync: detect wakeup event "
				"before waiting work[%d]\n", work_idx);
			return -EBUSY;
		}

		dbg_print("\nintr_sync: try to wait work[%d]\n", work_idx);

		spin_lock(&sync_work->lock);
		work_ver = sync_work->version;
		if (sync_work->state == INTR_SYNC_STATE_RUNNING) {
			spin_unlock(&sync_work->lock);
			dbg_print("intr_sync: work[%d] is already running, "
				"find idle work\n", work_idx);
			goto find_idle;
		}

		sync_work->waiter++;
		if (sync_work->state == INTR_SYNC_STATE_IDLE) {
			dbg_print("intr_sync: enqueue work[%d]\n", work_idx);
			sync_work->state = INTR_SYNC_STATE_QUEUED;
			INIT_COMPLETION(sync_work->done);
			queue_work(intr_sync_wq, &sync_work->work);
		}
		spin_unlock(&sync_work->lock);

		do {
			/* Check for a wakeup event before waiting:
			 * if one is detected, bail out right away
			 */
			if (__check_wakeup_event(&wd)) {
				spin_lock(&sync_work->lock);
				sync_work->waiter--;
				spin_unlock(&sync_work->lock);
				dbg_print("intr_sync: detect wakeup event "
					"while waiting work[%d]\n", work_idx);
				return -EBUSY;
			}

			// dbg_print("intr_sync: waiting work[%d]\n", work_idx);
			/* Returns 0 on timeout, positive on completion. */
			ret = wait_for_completion_io_timeout(
					&sync_work->done, HZ/10);
			/* The work we were waiting for has finished. */
			if ((ret > 0) || (sync_work->version != work_ver))
				break;
			// dbg_print("intr_sync: timeout work[%d]\n", work_idx);
		} while (1);

		spin_lock(&sync_work->lock);
		sync_work->waiter--;
		if (sync_ret)
			*sync_ret = sync_work->ret;
		spin_unlock(&sync_work->lock);
		dbg_print("intr_sync: sync work[%d] is done with ret(%d)\n",
				work_idx, sync_work->ret);
		return 0;
	}

	/* Check under the lock whether the workqueue already exists;
	 * create a new one if it has not been created yet.
	 */
	mutex_lock(&intr_sync_wq_lock);
	if (likely(!intr_sync_wq)) {
		intr_sync_work[0].id = 0;
		intr_sync_work[1].id = 1;
		INIT_WORK(&intr_sync_work[0].work, do_intr_sync);
		INIT_WORK(&intr_sync_work[1].work, do_intr_sync);
		spin_lock_init(&intr_sync_work[0].lock);
		spin_lock_init(&intr_sync_work[1].lock);
		init_completion(&intr_sync_work[0].done);
		init_completion(&intr_sync_work[1].done);
		intr_sync_wq = alloc_ordered_workqueue("intr_syncd", WQ_MEM_RECLAIM);
		dbg_print("\nintr_sync: try to allocate intr_sync_queue\n");
	}
	mutex_unlock(&intr_sync_wq_lock);

	/* Retry enqueueing if the workqueue was created successfully */
	if (likely(intr_sync_wq))
		goto enqueue_sync_wait;

	printk("\nintr_sync: allocation failed, just call sync()\n");
	ret = sys_sync();
	if (sync_ret)
		*sync_ret = ret;
	return 0;
}
#else /* CONFIG_INTERRUPTIBLE_SYNC */
int intr_sync(int *sync_ret)
{
	int ret = sys_sync();

	if (sync_ret)
		*sync_ret = ret;
	return 0;
}
#endif /* CONFIG_INTERRUPTIBLE_SYNC */
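
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * suspend-path caller could use intr_sync() like this, aborting suspend
 * when a wakeup event interrupts the sync:
 *
 *	int sync_ret = 0;
 *
 *	if (intr_sync(&sync_ret))
 *		return -EBUSY;	// wakeup event arrived: retry suspend later
 *	if (sync_ret)
 *		pr_warn("sync returned %d\n", sync_ret);
 */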

/*
 * Do the filesystem syncing work. For simple filesystems
 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
 * wait == 1 case since in that case write_inode() functions do
 * sync_dirty_buffer() and thus effectively write one block at a time.
 */
static int __sync_filesystem(struct super_block *sb, int wait)
{
	if (wait)
		sync_inodes_sb(sb);
	else
		writeback_inodes_sb(sb, WB_REASON_SYNC);

	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, wait);
	return __sync_blockdev(sb->s_bdev, wait);
}

/*
 * Write out and wait upon all dirty data associated with this
 * superblock. Filesystem data as well as the underlying block
 * device. Takes the superblock lock.
 */
int sync_filesystem(struct super_block *sb)
{
	int ret;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	/*
	 * No point in syncing out anything if the filesystem is read-only.
	 */
	if (sb->s_flags & MS_RDONLY)
		return 0;

	ret = __sync_filesystem(sb, 0);
	if (ret < 0)
		return ret;
	return __sync_filesystem(sb, 1);
}
EXPORT_SYMBOL_GPL(sync_filesystem);
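
/*
 * Illustrative sketch (not part of the original file): callers must hold
 * sb->s_umount to satisfy the WARN_ON above; syncfs(2) further down is the
 * canonical example:
 *
 *	down_read(&sb->s_umount);
 *	ret = sync_filesystem(sb);
 *	up_read(&sb->s_umount);
 */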

static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY))
		sync_inodes_sb(sb);
}

static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, *(int *)arg);
}

static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawrite(bdev->bd_inode->i_mapping);
}

static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawait(bdev->bd_inode->i_mapping);
}

/*
 * Sync everything. We start by waking flusher threads so that most of
 * writeback runs on all devices in parallel. Then we sync all inodes reliably
 * which effectively also waits for all flusher threads to finish doing
 * writeback. At this point all data is on disk so metadata should be stable
 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 * just write metadata (such as inodes or bitmaps) to block device page cache
 * and do not sync it on their own in ->sync_fs().
 */
SYSCALL_DEFINE0(sync)
{
	int nowait = 0, wait = 1;

	wakeup_flusher_threads(0, WB_REASON_SYNC);
	iterate_supers(sync_inodes_one_sb, NULL);
	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &wait);
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	iterate_bdevs(fdatawait_one_bdev, NULL);
	if (unlikely(laptop_mode))
		laptop_sync_completion();
	return 0;
}

static void do_sync_work(struct work_struct *work)
{
	int nowait = 0;

	/*
	 * Sync twice to reduce the possibility we skipped some inodes / pages
	 * because they were temporarily locked
	 */
	iterate_supers(sync_inodes_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	iterate_supers(sync_inodes_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	printk("Emergency Sync complete\n");
	kfree(work);
}

void emergency_sync(void)
{
	struct work_struct *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_sync_work);
		schedule_work(work);
	}
}
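
/*
 * Editorial note: emergency_sync() may be invoked from contexts that must
 * not sleep (e.g. the SysRq emergency-sync handler), hence the GFP_ATOMIC
 * allocation and the deferral of the actual sync to a workqueue.
 */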

/*
 * sync a single super
 */
SYSCALL_DEFINE1(syncfs, int, fd)
{
	struct fd f = fdget(fd);
	struct super_block *sb;
	int ret;

	if (!f.file)
		return -EBADF;
	sb = f.file->f_dentry->d_sb;

	down_read(&sb->s_umount);
	ret = sync_filesystem(sb);
	up_read(&sb->s_umount);

	fdput(f);
	return ret;
}

/**
 * vfs_fsync_range - helper to sync a range of data & metadata to disk
 * @file:		file to sync
 * @start:		offset in bytes of the beginning of data range to sync
 * @end:		offset in bytes of the end of data range (inclusive)
 * @datasync:		perform only datasync
 *
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
 */
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
	if (!file->f_op || !file->f_op->fsync)
		return -EINVAL;
	return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);

/**
 * vfs_fsync - perform a fsync or fdatasync on a file
 * @file:		file to sync
 * @datasync:		only perform a fdatasync operation
 *
 * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
 */
int vfs_fsync(struct file *file, int datasync)
{
	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);
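
/*
 * Illustrative mapping (not part of the original file): the fsync(2) and
 * fdatasync(2) entry points below reduce to this helper:
 *
 *	vfs_fsync(file, 0);	// fsync(): data plus all metadata
 *	vfs_fsync(file, 1);	// fdatasync(): data plus only the metadata
 *				// needed to access it
 */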

static int do_fsync(unsigned int fd, int datasync)
{
	struct fd f = fdget(fd);
	int ret = -EBADF;

	if (f.file) {
		ret = vfs_fsync(f.file, datasync);
		fdput(f);
		inc_syscfs(current);
	}
	return ret;
}

SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
	return do_fsync(fd, 0);
}

SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
	return do_fsync(fd, 1);
}

/**
 * generic_write_sync - perform syncing after a write if file / inode is sync
 * @file:	file to which the write happened
 * @pos:	offset where the write started
 * @count:	length of the write
 *
 * This is just a simple wrapper around our general syncing function.
 */
int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
		return 0;
	return vfs_fsync_range(file, pos, pos + count - 1,
			       (file->f_flags & __O_SYNC) ? 0 : 1);
}
EXPORT_SYMBOL(generic_write_sync);
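
/*
 * Illustrative decision table (not part of the original file) for the
 * checks above:
 *
 *	neither O_DSYNC nor IS_SYNC(inode)  -> return 0 (nothing to sync)
 *	otherwise, __O_SYNC set             -> datasync = 0 (full fsync)
 *	otherwise                           -> datasync = 1 (fdatasync-like)
 */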

/*
 * sys_sync_file_range() permits finely controlled syncing over a segment of
 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 * zero then sys_sync_file_range() will operate from offset out to EOF.
 *
 * The flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 * before performing the write.
 *
 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 * range which are not presently under writeback. Note that this may block for
 * significant periods due to exhaustion of disk request structures.
 *
 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 * after performing the write.
 *
 * Useful combinations of the flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 * in the range which were dirty on entry to sys_sync_file_range() are placed
 * under writeout.  This is a start-write-for-data-integrity operation.
 *
 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 * are not presently under writeout.  This is an asynchronous flush-to-disk
 * operation.  Not suitable for data integrity operations.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 * completion of writeout of all pages in the range.  This will be used after an
 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 * for that operation to complete and to return the result.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
 * a traditional sync() operation.  This is a write-for-data-integrity operation
 * which will ensure that all pages in the range which were dirty on entry to
 * sys_sync_file_range() are committed to disk.
 *
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 * I/O errors or ENOSPC conditions and will return those to the caller, after
 * clearing the EIO and ENOSPC flags in the address_space.
 *
 * It should be noted that none of these operations write out the file's
 * metadata.  So unless the application is strictly performing overwrites of
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
				unsigned int, flags)
{
	int ret;
	struct fd f;
	struct address_space *mapping;
	loff_t endbyte;			/* inclusive */
	umode_t i_mode;

	ret = -EINVAL;
	if (flags & ~VALID_FLAGS)
		goto out;

	endbyte = offset + nbytes;

	if ((s64)offset < 0)
		goto out;
	if ((s64)endbyte < 0)
		goto out;
	if (endbyte < offset)
		goto out;

	if (sizeof(pgoff_t) == 4) {
		if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
			/*
			 * The range starts outside a 32 bit machine's
			 * pagecache addressing capabilities.  Let it "succeed"
			 */
			ret = 0;
			goto out;
		}
		if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
			/*
			 * Out to EOF
			 */
			nbytes = 0;
		}
	}

	if (nbytes == 0)
		endbyte = LLONG_MAX;
	else
		endbyte--;		/* inclusive */

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file)
		goto out;

	i_mode = file_inode(f.file)->i_mode;
	ret = -ESPIPE;
	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
			!S_ISLNK(i_mode))
		goto out_put;

	mapping = f.file->f_mapping;
	if (!mapping) {
		ret = -EINVAL;
		goto out_put;
	}

	ret = 0;
	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
		ret = filemap_fdatawait_range(mapping, offset, endbyte);
		if (ret < 0)
			goto out_put;
	}

	if (flags & SYNC_FILE_RANGE_WRITE) {
		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
		if (ret < 0)
			goto out_put;
	}

	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
		ret = filemap_fdatawait_range(mapping, offset, endbyte);

out_put:
	fdput(f);
out:
	return ret;
}
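
/*
 * Illustrative userspace sketch (not part of the original file): a
 * write-for-data-integrity pass over an already-written range might look
 * like this:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int ret = sync_file_range(fd, offset, nbytes,
 *				  SYNC_FILE_RANGE_WAIT_BEFORE |
 *				  SYNC_FILE_RANGE_WRITE |
 *				  SYNC_FILE_RANGE_WAIT_AFTER);
 *
 * As the comment above warns, this does not write file metadata, so it is
 * only safe for overwrites of already-allocated blocks.
 */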

/* It would be nice if people remember that not all the world's an i386
   when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
				 loff_t, offset, loff_t, nbytes)
{
	return sys_sync_file_range(fd, offset, nbytes, flags);
}