From: Linus Torvalds Date: Thu, 25 Jun 2015 23:00:17 +0000 (-0700) Subject: Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=e4bc13adfd016fc1036838170288b5680d1a98b0;p=GitHub%2Fmoto-9609%2Fandroid_kernel_motorola_exynos9610.git Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block Pull cgroup writeback support from Jens Axboe: "This is the big pull request for adding cgroup writeback support. This code has been in development for a long time, and it has been simmering in for-next for a good chunk of this cycle too. This is one of those problems that has been talked about for at least half a decade, finally there's a solution and code to go with it. Also see last week's writeup on LWN: http://lwn.net/Articles/648292/" * 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits) writeback, blkio: add documentation for cgroup writeback support vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB writeback: do foreign inode detection iff cgroup writeback is enabled v9fs: fix error handling in v9fs_session_init() bdi: fix wrong error return value in cgwb_create() buffer: remove unusued 'ret' variable writeback: disassociate inodes from dying bdi_writebacks writeback: implement foreign cgroup inode bdi_writeback switching writeback: add lockdep annotation to inode_to_wb() writeback: use unlocked_inode_to_wb transaction in inode_congested() writeback: implement unlocked_inode_to_wb transaction and use it for stat updates writeback: implement [locked_]inode_to_wb_and_lock_list() writeback: implement foreign cgroup inode detection writeback: make writeback_control track the inode being written back writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use writeback: implement memcg writeback domain based throttling writeback: reset wb_domain->dirty_limit[_tstmp] 
when memcg domain size changes writeback: implement memcg wb_domain writeback: update wb_over_bg_thresh() to use wb_domain aware operations ... --- e4bc13adfd016fc1036838170288b5680d1a98b0 diff --cc block/blk-cgroup.c index 6e43fa355e71,31610ae0ebff..9f97da52d006 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@@ -30,9 -27,12 +31,11 @@@ static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); + struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static bool blkcg_policy_enabled(struct request_queue *q, @@@ -868,16 -843,10 +872,18 @@@ done spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - + #ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); + #endif return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + kfree(blkcg->pd[i]); + +free_blkcg: + kfree(blkcg); + return ret; } /** @@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques const struct blkcg_policy *pol) { LIST_HEAD(pds); + LIST_HEAD(cpds); - struct blkcg_gq *blkg, *new_blkg; + struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *n; + struct blkg_policy_data *pd, *nd; + struct blkcg_policy_data *cpd, *cnd; int cnt = 0, ret; - bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - + /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. 
- */ spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; - spin_unlock_irq(q->queue_lock); + /* + * Allocate per-blkg and per-blkcg policy data + * for all existing blkgs. + */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { diff --cc include/linux/backing-dev.h index d87d8eced064,a13181a42b9a..0e6d4828a77a --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@@ -116,13 -23,13 +23,12 @@@ __printf(3, 4 int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); -void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); - void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); - void bdi_start_background_writeback(struct backing_dev_info *bdi); - void bdi_writeback_workfn(struct work_struct *work); - int bdi_has_dirty_io(struct backing_dev_info *bdi); - void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); + void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); + void wb_start_background_writeback(struct bdi_writeback *wb); + void wb_workfn(struct work_struct *work); + void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --cc include/linux/blk-cgroup.h index 000000000000,07a32b813ed8..58cfab80dd70 mode 000000,100644..100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@@ -1,0 -1,631 +1,655 @@@ + #ifndef _BLK_CGROUP_H + #define _BLK_CGROUP_H + /* 
+ * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + + #include + #include + #include + #include + #include + #include + + /* Max limits for throttle policy */ + #define THROTL_IOPS_MAX UINT_MAX + -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - + #ifdef CONFIG_BLK_CGROUP + + enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, + }; + + struct blkcg_gq; + + struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; ++ struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + + #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + #endif + }; + + struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; + }; + + struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; + }; + + /* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. 
+ */ + struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; + }; + ++/* ++ * Policies that need to keep per-blkcg data which is independent ++ * from any request_queue associated to it must specify its size ++ * with the cpd_size field of the blkcg_policy structure and ++ * embed a blkcg_policy_data in it. blkcg core allocates ++ * policy-specific per-blkcg structures lazily the first time ++ * they are actually needed, so it handles them together with ++ * blkgs. cpd_init() is invoked to let each policy handle ++ * per-blkcg data. ++ */ ++struct blkcg_policy_data { ++ /* the policy id this per-policy data belongs to */ ++ int plid; ++ ++ /* used during policy activation */ ++ struct list_head alloc_node; ++}; ++ + /* association between a blk cgroup and a request queue */ + struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? 
protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; + }; + ++typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); + typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); + typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + + struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; ++ /* policy specific per-blkcg data size */ ++ size_t cpd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ ++ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + }; + + extern struct blkcg blkcg_root; + extern struct cgroup_subsys_state * const blkcg_root_css; + + struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); + struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); + int blkcg_init_queue(struct request_queue *q); + void blkcg_drain_queue(struct request_queue *q); + void blkcg_exit_queue(struct request_queue *q); + + /* Blkio controller policy registration */ + int blkcg_policy_register(struct blkcg_policy *pol); + void blkcg_policy_unregister(struct blkcg_policy *pol); + int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + + void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); + u64 
__blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); + u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); + u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + + u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); + struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + + struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; + }; + + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); + void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + + static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) + { + return css ? container_of(css, struct blkcg, css) : NULL; + } + + static inline struct blkcg *task_blkcg(struct task_struct *tsk) + { + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + } + + static inline struct blkcg *bio_blkcg(struct bio *bio) + { + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); + } + + static inline struct cgroup_subsys_state * + task_get_blkcg_css(struct task_struct *task) + { + return task_get_css(task, blkio_cgrp_id); + } + + /** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ + static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) + { + return css_to_blkcg(blkcg->css.parent); + } + + /** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) + { + return blkg ? 
blkg->pd[pol->plid] : NULL; + } + ++static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, ++ struct blkcg_policy *pol) ++{ ++ return blkcg ? blkcg->pd[pol->plid] : NULL; ++} ++ + /** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ + static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) + { + return pd ? pd->blkg : NULL; + } + + /** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ + static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) + { + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; + } + + /** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ + static inline void blkg_get(struct blkcg_gq *blkg) + { + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); + } + + void __blkg_release_rcu(struct rcu_head *rcu); + + /** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ + static inline void blkg_put(struct blkcg_gq *blkg) + { + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); + } + + struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + + /** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. 
If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ + #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + + /** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ + #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + + /** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ + static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) + { + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. 
Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; + root_rl: + rcu_read_unlock(); + return &q->root_rl; + } + + /** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ + static inline void blk_put_rl(struct request_list *rl) + { + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); + } + + /** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) + { + rq->rl = rl; + } + + /** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ + static inline struct request_list *blk_rq_rl(struct request *rq) + { + return rq->rl; + } + + struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); + /** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ + #define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + + static inline void blkg_stat_init(struct blkg_stat *stat) + { + u64_stats_init(&stat->syncp); + } + + /** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. 
+ */ + static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) + { + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); + } + + /** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchroniztion and takes care of u64 atomicity. + */ + static inline uint64_t blkg_stat_read(struct blkg_stat *stat) + { + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; + } + + /** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ + static inline void blkg_stat_reset(struct blkg_stat *stat) + { + stat->cnt = 0; + } + + /** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ + static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) + { + blkg_stat_add(to, blkg_stat_read(from)); + } + + static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) + { + u64_stats_init(&rwstat->syncp); + } + + /** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. 
+ */ + static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) + { + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); + } + + /** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ + static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) + { + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; + } + + /** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ + static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) + { + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + } + + /** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ + static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) + { + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + } + + /** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. 
+ */ + static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) + { + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); + } + + #else /* CONFIG_BLK_CGROUP */ + + struct blkcg { + }; + + struct blkg_policy_data { + }; + ++struct blkcg_policy_data { ++}; ++ + struct blkcg_gq { + }; + + struct blkcg_policy { + }; + + #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + + static inline struct cgroup_subsys_state * + task_get_blkcg_css(struct task_struct *task) + { + return NULL; + } + + #ifdef CONFIG_BLOCK + + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } + static inline int blkcg_init_queue(struct request_queue *q) { return 0; } + static inline void blkcg_drain_queue(struct request_queue *q) { } + static inline void blkcg_exit_queue(struct request_queue *q) { } + static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } + static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } + static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } + static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + + static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } + static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } + static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } + static inline void blkg_get(struct blkcg_gq *blkg) { } + static inline void blkg_put(struct blkcg_gq *blkg) { } + + static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } + static 
inline void blk_put_rl(struct request_list *rl) { } + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } + static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + + #define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + + #endif /* CONFIG_BLOCK */ + #endif /* CONFIG_BLK_CGROUP */ + #endif /* _BLK_CGROUP_H */ diff --cc include/linux/blkdev.h index 5ced29cef03f,ab4a27852f1b..7f2f54b4587f --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); - /* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ - static inline void blk_clear_queue_congested(struct request_queue *q, int sync) - { - clear_bdi_congested(&q->backing_dev_info, sync); - } - - /* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. 
- */ - static inline void blk_set_queue_congested(struct request_queue *q, int sync) - { - set_bdi_congested(&q->backing_dev_info, sync); - } -extern void blk_queue_bio(struct request_queue *q, struct bio *bio); -- extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); diff --cc mm/backing-dev.c index 000e7b3b9896,436bb53dd383..7756da31b02b --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@@ -387,49 -746,91 +746,74 @@@ int bdi_init(struct backing_dev_info *b bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); + init_waitqueue_head(&bdi->wb_waitq); - bdi_wb_init(&bdi->wb, bdi); + err = wb_init(&bdi->wb, bdi, GFP_KERNEL); + if (err) + return err; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } + bdi->wb_congested.state = 0; + bdi->wb.congested = &bdi->wb_congested; - bdi->dirty_exceeded = 0; + cgwb_bdi_init(bdi); + return 0; + } + EXPORT_SYMBOL(bdi_init); - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; + int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) 
+ { + va_list args; + struct device *dev; - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); - if (err) { - err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); - } + bdi->dev = dev; - return err; + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } - EXPORT_SYMBOL(bdi_init); + EXPORT_SYMBOL(bdi_register); - void bdi_destroy(struct backing_dev_info *bdi) + int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) { - int i; + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); + } + EXPORT_SYMBOL(bdi_register_dev); + + /* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ + static void bdi_remove_from_list(struct backing_dev_info *bdi) + { + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); - bdi_wb_shutdown(bdi); - bdi_set_min_ratio(bdi, 0); + synchronize_rcu_expedited(); + } - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); -/* - * Called when the device behind @bdi has been removed or ejected. - * - * We can't really do much here except for reducing the dirty ratio at - * the moment. In the future we should be able to set a flag so that - * the filesystem can handle errors at mark_inode_dirty time instead - * of only at writeback time. 
- */ -void bdi_unregister(struct backing_dev_info *bdi) -{ - if (WARN_ON_ONCE(!bdi->dev)) - return; - - bdi_set_min_ratio(bdi, 0); -} -EXPORT_SYMBOL(bdi_unregister); - + void bdi_destroy(struct backing_dev_info *bdi) + { + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); diff --cc mm/filemap.c index 8d17ceea8dbe,bfc1ab053b12..11f10efd637c --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -485,15 -498,11 +500,16 @@@ int replace_page_cache_page(struct pag error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. + */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) diff --cc mm/page-writeback.c index eb59f7eea508,e1514d5b4e9b..22cddd3e5de8 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@@ -802,27 -990,27 +990,27 @@@ static void wb_position_ratio(struct di * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. 
*/ - bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* - * scale global setpoint to bdi's: - * bdi_setpoint = setpoint * bdi_thresh / thresh + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh */ - x = div_u64((u64)bdi_thresh << 16, thresh | 1); - bdi_setpoint = setpoint * (u64)x >> 16; - x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1); ++ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; /* - * Use span=(8*write_bw) in single bdi case as indicated by - * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * - * bdi_thresh thresh - bdi_thresh - * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh - * thresh thresh + * wb_thresh thresh - wb_thresh + * span = --------- * (8 * write_bw) + ------------------ * wb_thresh + * thresh thresh */ - span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; - x_intercept = bdi_setpoint + span; + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; - if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), - (x_intercept - bdi_setpoint) | 1); + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), - x_intercept - wb_setpoint + 1); ++ (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4;