From: Ann Koehler Date: Sun, 29 Jan 2017 00:04:39 +0000 (-0500) Subject: staging: lustre: obd: RCU stalls in lu_cache_shrink_count() X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=e202d55b05f29f7d17548167b84764bb951b5b90;p=GitHub%2Fmoto-9609%2Fandroid_kernel_motorola_exynos9610.git staging: lustre: obd: RCU stalls in lu_cache_shrink_count() The algorithm for counting freeable objects in the lu_cache shrinker does not scale with the number of cpus. The LU_SS_LRU_LEN counter for each cpu is read and summed at shrink time while holding the lu_sites_guard mutex. With a large number of cpus and low memory conditions, processes bottleneck on the mutex. This mod reduces the time spent counting by using the kernel's percpu counter functions to maintain the length of a site's lru. The summing occurs when a percpu value is incremented or decremented and a threshold is exceeded. lu_cache_shrink_count() simply returns the last such computed sum. This mod also replaces the lu_sites_guard mutex with a rw semaphore. The lock protects the lu_site list, which is modified when a file system is mounted/umounted or when the lu_site is purged. lu_cache_shrink_count simply reads data so it does not need to wait for other readers. lu_cache_shrink_scan, which actually frees the unused objects, is still serialized. Signed-off-by: Ann Koehler Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-7997 Reviewed-on: http://review.whamcloud.com/19390 Reviewed-by: Andreas Dilger Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin Signed-off-by: James Simmons Signed-off-by: Greg Kroah-Hartman --- diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h index f283a036a748..0826c1956ba8 100644 --- a/drivers/staging/lustre/lustre/include/lu_object.h +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -34,6 +34,7 @@ #define __LUSTRE_LU_OBJECT_H #include +#include #include "../../include/linux/libcfs/libcfs.h" #include "lustre/lustre_idl.h" #include "lu_ref.h" @@ -580,7 +581,6 @@ enum { LU_SS_CACHE_RACE, LU_SS_CACHE_DEATH_RACE, LU_SS_LRU_PURGED, - LU_SS_LRU_LEN, /* # of objects in lsb_lru lists */ LU_SS_LAST_STAT }; @@ -635,6 +635,10 @@ struct lu_site { * XXX: a hack! fld has to find md_site via site, remove when possible */ struct seq_server_site *ld_seq_site; + /** + * Number of objects in lsb_lru_lists - used for shrinking + */ + struct percpu_counter ls_lru_len_counter; }; static inline struct lu_site_bkt_data * diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 7971562a3efd..18058615bd96 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -151,7 +151,7 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) LASSERT(list_empty(&top->loh_lru)); list_add_tail(&top->loh_lru, &bkt->lsb_lru); bkt->lsb_lru_len++; - lprocfs_counter_incr(site->ls_stats, LU_SS_LRU_LEN); + percpu_counter_inc(&site->ls_lru_len_counter); CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, lru_len: %ld\n", o, site->ls_obj_hash, bkt, bkt->lsb_lru_len); cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); @@ -202,7 +202,7 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o) list_del_init(&top->loh_lru); bkt = cfs_hash_bd_extra_get(obj_hash, &bd); bkt->lsb_lru_len--; - lprocfs_counter_decr(site->ls_stats, LU_SS_LRU_LEN); + percpu_counter_dec(&site->ls_lru_len_counter); } cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); cfs_hash_bd_unlock(obj_hash, &bd, 1); @@ -379,7 +379,7 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) &bd2, &h->loh_hash); list_move(&h->loh_lru, &dispose); bkt->lsb_lru_len--; - lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN); + percpu_counter_dec(&s->ls_lru_len_counter); if (did_sth == 0) did_sth = 1; @@ -578,7 +578,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, if (!list_empty(&h->loh_lru)) { list_del_init(&h->loh_lru); bkt->lsb_lru_len--; - lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN); + percpu_counter_dec(&s->ls_lru_len_counter); } return lu_object_top(h); } @@ -820,7 +820,7 @@ EXPORT_SYMBOL(lu_device_type_fini); * Global list of all sites on this node */ static LIST_HEAD(lu_sites); -static DEFINE_MUTEX(lu_sites_guard); +static DECLARE_RWSEM(lu_sites_guard); /** * Global environment used by site shrinker. @@ -994,9 +994,15 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) unsigned long bits; unsigned long i; char name[16]; + int rc; memset(s, 0, sizeof(*s)); mutex_init(&s->ls_purge_mutex); + + rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); + if (rc) + return -ENOMEM; + snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name); for (bits = lu_htable_order(top); bits >= LU_SITE_BITS_MIN; bits--) { s->ls_obj_hash = cfs_hash_create(name, bits, bits, @@ -1042,12 +1048,6 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) 0, "cache_death_race", "cache_death_race"); lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, 0, "lru_purged", "lru_purged"); - /* - * Unlike other counters, lru_len can be decremented so - * need lc_sum instead of just lc_count - */ - lprocfs_counter_init(s->ls_stats, LU_SS_LRU_LEN, - LPROCFS_CNTR_AVGMINMAX, "lru_len", "lru_len"); INIT_LIST_HEAD(&s->ls_linkage); s->ls_top_dev = top; @@ -1069,9 +1069,11 @@ EXPORT_SYMBOL(lu_site_init); */ void lu_site_fini(struct lu_site *s) { - mutex_lock(&lu_sites_guard); + down_write(&lu_sites_guard); list_del_init(&s->ls_linkage); - mutex_unlock(&lu_sites_guard); + up_write(&lu_sites_guard); + + percpu_counter_destroy(&s->ls_lru_len_counter); if (s->ls_obj_hash) { cfs_hash_putref(s->ls_obj_hash); @@ -1097,11 +1099,11 @@ int lu_site_init_finish(struct lu_site *s) { int result; - mutex_lock(&lu_sites_guard); + down_write(&lu_sites_guard); result = lu_context_refill(&lu_shrink_env.le_ctx); if (result == 0) list_add(&s->ls_linkage, &lu_sites); - mutex_unlock(&lu_sites_guard); + up_write(&lu_sites_guard); return result; } EXPORT_SYMBOL(lu_site_init_finish); @@ -1820,12 +1822,15 @@ static void lu_site_stats_get(struct cfs_hash *hs, } /* - * lu_cache_shrink_count returns the number of cached objects that are - * candidates to be freed by shrink_slab(). A counter, which tracks - * the number of items in the site's lru, is maintained in the per cpu - * stats of each site. The counter is incremented when an object is added - * to a site's lru and decremented when one is removed. The number of - * free-able objects is the sum of all per cpu counters for all sites. + * lu_cache_shrink_count() returns an approximate number of cached objects + * that can be freed by shrink_slab(). A counter, which tracks the + * number of items in the site's lru, is maintained in a percpu_counter + * for each site. The percpu values are incremented and decremented as + * objects are added or removed from the lru. The percpu values are summed + * and saved whenever a percpu value exceeds a threshold. Thus the saved, + * summed value at any given time may not accurately reflect the current + * lru length. But this value is sufficiently accurate for the needs of + * a shrinker. * * Using a per cpu counter is a compromise solution to concurrent access: * lu_object_put() can update the counter without locking the site and @@ -1842,11 +1847,10 @@ static unsigned long lu_cache_shrink_count(struct shrinker *sk, if (!(sc->gfp_mask & __GFP_FS)) return 0; - mutex_lock(&lu_sites_guard); - list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { - cached += ls_stats_read(s->ls_stats, LU_SS_LRU_LEN); - } - mutex_unlock(&lu_sites_guard); + down_read(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) + cached += percpu_counter_read_positive(&s->ls_lru_len_counter); + up_read(&lu_sites_guard); cached = (cached / 100) * sysctl_vfs_cache_pressure; CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", @@ -1877,7 +1881,7 @@ static unsigned long lu_cache_shrink_scan(struct shrinker *sk, */ return SHRINK_STOP; - mutex_lock(&lu_sites_guard); + down_write(&lu_sites_guard); list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { freed = lu_site_purge(&lu_shrink_env, s, remain); remain -= freed; @@ -1888,7 +1892,7 @@ static unsigned long lu_cache_shrink_scan(struct shrinker *sk, list_move_tail(&s->ls_linkage, &splice); } list_splice(&splice, lu_sites.prev); - mutex_unlock(&lu_sites_guard); + up_write(&lu_sites_guard); return sc->nr_to_scan - remain; } @@ -1925,9 +1929,9 @@ int lu_global_init(void) * conservatively. This should not be too bad, because this * environment is global. */ - mutex_lock(&lu_sites_guard); + down_write(&lu_sites_guard); result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); - mutex_unlock(&lu_sites_guard); + up_write(&lu_sites_guard); if (result != 0) return result; @@ -1953,9 +1957,9 @@ void lu_global_fini(void) * Tear shrinker environment down _after_ de-registering * lu_global_key, because the latter has a value in the former. */ - mutex_lock(&lu_sites_guard); + down_write(&lu_sites_guard); lu_env_fini(&lu_shrink_env); - mutex_unlock(&lu_sites_guard); + up_write(&lu_sites_guard); lu_ref_global_fini(); } @@ -1965,13 +1969,6 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) struct lprocfs_counter ret; lprocfs_stats_collect(stats, idx, &ret); - if (idx == LU_SS_LRU_LEN) - /* - * protect against counter on cpu A being decremented - * before counter is incremented on cpu B; unlikely - */ - return (__u32)((ret.lc_sum > 0) ? ret.lc_sum : 0); - return (__u32)ret.lc_count; } @@ -1986,7 +1983,7 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) memset(&stats, 0, sizeof(stats)); lu_site_stats_get(s->ls_obj_hash, &stats, 1); - seq_printf(m, "%d/%d %d/%ld %d %d %d %d %d %d %d %d\n", + seq_printf(m, "%d/%d %d/%ld %d %d %d %d %d %d %d\n", stats.lss_busy, stats.lss_total, stats.lss_populated, @@ -1997,8 +1994,7 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), - ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED), - ls_stats_read(s->ls_stats, LU_SS_LRU_LEN)); + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); return 0; } EXPORT_SYMBOL(lu_site_stats_print);