From: Linus Torvalds
Date: Thu, 25 Jun 2015 23:00:17 +0000 (-0700)
Subject: Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=e4bc13adfd016fc1036838170288b5680d1a98b0;p=GitHub%2FLineageOS%2FG12%2Fandroid_kernel_amlogic_linux-4.9.git

Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block

Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.
  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.

  This is one of those problems that has been talked about for at least
  half a decade; finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

      http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...
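As context for the diff that follows: after this series a filesystem opts in to cgroup-aware writeback by setting the new SB_I_CGROUPWB flag in its super_block's s_iflags (see the "vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB" commit above). The snippet below is a minimal sketch assuming that post-series API; "examplefs" is a hypothetical name used only for illustration and is not part of this merge.

#include <linux/fs.h>

/*
 * Minimal sketch: a filesystem that wants cgroup writeback would do this
 * from its ->fill_super() callback (the rest of superblock setup is
 * omitted here).  "examplefs" is hypothetical.
 */
static void examplefs_opt_into_cgroup_writeback(struct super_block *sb)
{
	/* dirty/writeback accounting for this sb is then tracked per cgroup */
	sb->s_iflags |= SB_I_CGROUPWB;
}

Without the flag, writeback for the filesystem keeps going through the bdi's root bdi_writeback, as before this series.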
--- e4bc13adfd016fc1036838170288b5680d1a98b0 diff --cc block/blk-cgroup.c index 6e43fa355e71,31610ae0ebff..9f97da52d006 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@@ -30,9 -27,12 +31,11 @@@ static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); + struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static bool blkcg_policy_enabled(struct request_queue *q, @@@ -868,16 -843,10 +872,18 @@@ done spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - + #ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); + #endif return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + kfree(blkcg->pd[i]); + +free_blkcg: + kfree(blkcg); + return ret; } /** @@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques const struct blkcg_policy *pol) { LIST_HEAD(pds); + LIST_HEAD(cpds); - struct blkcg_gq *blkg, *new_blkg; + struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *n; + struct blkg_policy_data *pd, *nd; + struct blkcg_policy_data *cpd, *cnd; int cnt = 0, ret; - bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - + /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; - spin_unlock_irq(q->queue_lock); + /* + * Allocate per-blkg and per-blkcg policy data + * for all existing blkgs. 
+ */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { diff --cc include/linux/backing-dev.h index d87d8eced064,a13181a42b9a..0e6d4828a77a --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@@ -116,13 -23,13 +23,12 @@@ __printf(3, 4 int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); -void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); - void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); - void bdi_start_background_writeback(struct backing_dev_info *bdi); - void bdi_writeback_workfn(struct work_struct *work); - int bdi_has_dirty_io(struct backing_dev_info *bdi); - void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); + void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); + void wb_start_background_writeback(struct bdi_writeback *wb); + void wb_workfn(struct work_struct *work); + void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --cc include/linux/blk-cgroup.h index 000000000000,07a32b813ed8..58cfab80dd70 mode 000000,100644..100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@@ -1,0 -1,631 +1,655 @@@ + #ifndef _BLK_CGROUP_H + #define _BLK_CGROUP_H + /* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + + #include + #include + #include + #include + #include + #include + + /* Max limits for throttle policy */ + #define THROTL_IOPS_MAX UINT_MAX + -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - + #ifdef CONFIG_BLK_CGROUP + + enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, + }; + + struct blkcg_gq; + + struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; ++ struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + + #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + #endif + }; + + struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; + }; + + struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; + }; + + /* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. 
+ */ + struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; + }; + ++/* ++ * Policies that need to keep per-blkcg data which is independent ++ * from any request_queue associated to it must specify its size ++ * with the cpd_size field of the blkcg_policy structure and ++ * embed a blkcg_policy_data in it. blkcg core allocates ++ * policy-specific per-blkcg structures lazily the first time ++ * they are actually needed, so it handles them together with ++ * blkgs. cpd_init() is invoked to let each policy handle ++ * per-blkcg data. ++ */ ++struct blkcg_policy_data { ++ /* the policy id this per-policy data belongs to */ ++ int plid; ++ ++ /* used during policy activation */ ++ struct list_head alloc_node; ++}; ++ + /* association between a blk cgroup and a request queue */ + struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; + }; + ++typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); + typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + + struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; ++ /* policy specific per-blkcg data size */ ++ size_t cpd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ ++ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + }; + + extern struct blkcg blkcg_root; + extern struct cgroup_subsys_state * const blkcg_root_css; + + struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); + struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); + int blkcg_init_queue(struct request_queue *q); + void blkcg_drain_queue(struct request_queue *q); + void blkcg_exit_queue(struct request_queue *q); + + /* Blkio controller policy registration */ + int blkcg_policy_register(struct blkcg_policy *pol); + void blkcg_policy_unregister(struct blkcg_policy *pol); + int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + + void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); + u64 __blkg_prfill_u64(struct 
seq_file *sf, struct blkg_policy_data *pd, u64 v); + u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); + u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); + u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + + u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); + struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + + struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; + }; + + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); + void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + + static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) + { + return css ? container_of(css, struct blkcg, css) : NULL; + } + + static inline struct blkcg *task_blkcg(struct task_struct *tsk) + { + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + } + + static inline struct blkcg *bio_blkcg(struct bio *bio) + { + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); + } + + static inline struct cgroup_subsys_state * + task_get_blkcg_css(struct task_struct *task) + { + return task_get_css(task, blkio_cgrp_id); + } + + /** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ + static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) + { + return css_to_blkcg(blkcg->css.parent); + } + + /** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) + { + return blkg ? blkg->pd[pol->plid] : NULL; + } + ++static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, ++ struct blkcg_policy *pol) ++{ ++ return blkcg ? blkcg->pd[pol->plid] : NULL; ++} ++ + /** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ + static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) + { + return pd ? pd->blkg : NULL; + } + + /** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ + static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) + { + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; + } + + /** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. 
+ */ + static inline void blkg_get(struct blkcg_gq *blkg) + { + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); + } + + void __blkg_release_rcu(struct rcu_head *rcu); + + /** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ + static inline void blkg_put(struct blkcg_gq *blkg) + { + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); + } + + struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + + /** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ + #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + + /** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ + #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + + /** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ + static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) + { + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; + root_rl: + rcu_read_unlock(); + return &q->root_rl; + } + + /** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. 
+ */ + static inline void blk_put_rl(struct request_list *rl) + { + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); + } + + /** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) + { + rq->rl = rl; + } + + /** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ + static inline struct request_list *blk_rq_rl(struct request *rq) + { + return rq->rl; + } + + struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); + /** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ + #define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + + static inline void blkg_stat_init(struct blkg_stat *stat) + { + u64_stats_init(&stat->syncp); + } + + /** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. + */ + static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) + { + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); + } + + /** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchroniztion and takes care of u64 atomicity. + */ + static inline uint64_t blkg_stat_read(struct blkg_stat *stat) + { + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; + } + + /** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ + static inline void blkg_stat_reset(struct blkg_stat *stat) + { + stat->cnt = 0; + } + + /** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ + static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) + { + blkg_stat_add(to, blkg_stat_read(from)); + } + + static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) + { + u64_stats_init(&rwstat->syncp); + } + + /** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ + static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) + { + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); + } + + /** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. 
+ * This function can be called without synchronization and takes care of + * u64 atomicity. + */ + static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) + { + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; + } + + /** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ + static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) + { + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + } + + /** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ + static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) + { + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + } + + /** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ + static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) + { + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); + } + + #else /* CONFIG_BLK_CGROUP */ + + struct blkcg { + }; + + struct blkg_policy_data { + }; + ++struct blkcg_policy_data { ++}; ++ + struct blkcg_gq { + }; + + struct blkcg_policy { + }; + + #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + + static inline struct cgroup_subsys_state * + task_get_blkcg_css(struct task_struct *task) + { + return NULL; + } + + #ifdef CONFIG_BLOCK + + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } + static inline int blkcg_init_queue(struct request_queue *q) { return 0; } + static inline void blkcg_drain_queue(struct request_queue *q) { } + static inline void blkcg_exit_queue(struct request_queue *q) { } + static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } + static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } + static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } + static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + + static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } + static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } + static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } + static inline void blkg_get(struct blkcg_gq *blkg) { } + static inline void blkg_put(struct blkcg_gq *blkg) { } + + static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } + static inline void blk_put_rl(struct request_list *rl) { } + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } + static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + + #define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 
+ + #endif /* CONFIG_BLOCK */ + #endif /* CONFIG_BLK_CGROUP */ + #endif /* _BLK_CGROUP_H */ diff --cc include/linux/blkdev.h index 5ced29cef03f,ab4a27852f1b..7f2f54b4587f --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); - /* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ - static inline void blk_clear_queue_congested(struct request_queue *q, int sync) - { - clear_bdi_congested(&q->backing_dev_info, sync); - } - - /* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. - */ - static inline void blk_set_queue_congested(struct request_queue *q, int sync) - { - set_bdi_congested(&q->backing_dev_info, sync); - } -extern void blk_queue_bio(struct request_queue *q, struct bio *bio); -- extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); diff --cc mm/backing-dev.c index 000e7b3b9896,436bb53dd383..7756da31b02b --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@@ -387,49 -746,91 +746,74 @@@ int bdi_init(struct backing_dev_info *b bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); + init_waitqueue_head(&bdi->wb_waitq); - bdi_wb_init(&bdi->wb, bdi); + err = wb_init(&bdi->wb, bdi, GFP_KERNEL); + if (err) + return err; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } + bdi->wb_congested.state = 0; + bdi->wb.congested = &bdi->wb_congested; - bdi->dirty_exceeded = 0; + cgwb_bdi_init(bdi); + return 0; + } + EXPORT_SYMBOL(bdi_init); - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; + int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) 
+ { + va_list args; + struct device *dev; - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); - if (err) { - err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); - } + bdi->dev = dev; - return err; + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } - EXPORT_SYMBOL(bdi_init); + EXPORT_SYMBOL(bdi_register); - void bdi_destroy(struct backing_dev_info *bdi) + int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) { - int i; + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); + } + EXPORT_SYMBOL(bdi_register_dev); + + /* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ + static void bdi_remove_from_list(struct backing_dev_info *bdi) + { + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); - bdi_wb_shutdown(bdi); - bdi_set_min_ratio(bdi, 0); + synchronize_rcu_expedited(); + } - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); -/* - * Called when the device behind @bdi has been removed or ejected. - * - * We can't really do much here except for reducing the dirty ratio at - * the moment. In the future we should be able to set a flag so that - * the filesystem can handle errors at mark_inode_dirty time instead - * of only at writeback time. - */ -void bdi_unregister(struct backing_dev_info *bdi) -{ - if (WARN_ON_ONCE(!bdi->dev)) - return; - - bdi_set_min_ratio(bdi, 0); -} -EXPORT_SYMBOL(bdi_unregister); - + void bdi_destroy(struct backing_dev_info *bdi) + { + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); diff --cc mm/filemap.c index 8d17ceea8dbe,bfc1ab053b12..11f10efd637c --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -485,15 -498,11 +500,16 @@@ int replace_page_cache_page(struct pag error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. + */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) diff --cc mm/page-writeback.c index eb59f7eea508,e1514d5b4e9b..22cddd3e5de8 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@@ -802,27 -990,27 +990,27 @@@ static void wb_position_ratio(struct di * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. 
 	 */
-	bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
 	/*
-	 * scale global setpoint to bdi's:
-	 * bdi_setpoint = setpoint * bdi_thresh / thresh
+	 * scale global setpoint to wb's:
+	 * wb_setpoint = setpoint * wb_thresh / thresh
 	 */
-	x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-	bdi_setpoint = setpoint * (u64)x >> 16;
-	x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
++	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+	wb_setpoint = setpoint * (u64)x >> 16;
 	/*
-	 * Use span=(8*write_bw) in single bdi case as indicated by
-	 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+	 * Use span=(8*write_bw) in single wb case as indicated by
+	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
 	 *
-	 *        bdi_thresh                    thresh - bdi_thresh
-	 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-	 *          thresh                            thresh
+	 *        wb_thresh                    thresh - wb_thresh
+	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+	 *          thresh                           thresh
 	 */
-	span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-	x_intercept = bdi_setpoint + span;
+	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+	x_intercept = wb_setpoint + span;
-	if (bdi_dirty < x_intercept - span / 4) {
-		pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-				      (x_intercept - bdi_setpoint) | 1);
+	if (dtc->wb_dirty < x_intercept - span / 4) {
+		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
-				      x_intercept - wb_setpoint + 1);
++				      (x_intercept - wb_setpoint) | 1);
 	} else
 		pos_ratio /= 4;
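For readers skimming the new include/linux/blk-cgroup.h above: the per-blkcg policy-data hooks (cpd_size, cpd_init_fn and struct blkcg_policy_data) are what let a policy keep per-cgroup state outside struct blkcg. Below is a minimal, hedged sketch of how a policy might use them, assuming the 4.2-era API exactly as declared in this diff; every "example_*" identifier is hypothetical and not part of this merge.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-cgroup.h>

/*
 * Hypothetical per-blkcg data: blkcg_policy_data is embedded first (like
 * blkg_policy_data in per-blkg data), so the cpd_size-byte area that blkcg
 * core allocates and stores in blkcg->pd[plid] can be converted back with
 * container_of().
 */
struct example_blkcg_data {
	struct blkcg_policy_data cpd;
	unsigned int weight;	/* per-cgroup knob kept outside struct blkcg */
};

static struct blkcg_policy example_blkcg_policy;

/* Invoked by blkcg core once the per-blkcg area for this policy exists. */
static void example_cpd_init(const struct blkcg *blkcg)
{
	struct example_blkcg_data *ecd =
		container_of(blkcg->pd[example_blkcg_policy.plid],
			     struct example_blkcg_data, cpd);

	ecd->weight = 500;	/* arbitrary default, for illustration only */
}

static struct blkcg_policy example_blkcg_policy = {
	.pd_size	= sizeof(struct blkg_policy_data),	/* no extra per-blkg data */
	.cpd_size	= sizeof(struct example_blkcg_data),
	.cpd_init_fn	= example_cpd_init,
};

static int __init example_policy_init(void)
{
	/* ->plid is assigned by the core during registration */
	return blkcg_policy_register(&example_blkcg_policy);
}
module_init(example_policy_init);
MODULE_LICENSE("GPL");

This roughly mirrors what the CFQ conversion in this series does with its own per-blkcg data: the cfq_weight/cfq_leaf_weight fields removed from struct blkcg and blkcg_root at the top of the blk-cgroup.c hunk are not dropped, they move behind CFQ's policy-specific cpd.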