From aefd9d714d5dc65b748e4f226475537ba6e00e86 Mon Sep 17 00:00:00 2001 From: Li Xi Date: Mon, 14 Sep 2015 18:41:32 -0400 Subject: [PATCH] staging/lustre/osc: use global osc_rq_pool to reduce memory usage The per-osc request pools consume a lot of memory if there are hundreds of OSCs on one client. This will be a critical problem if the client doesn't have sufficient memory for both OSCs and applications. This patch replaces per-osc request pools with a global pool osc_rq_pool. The total memory usage is 5MB by default. And it can be set by a module parameter of OSC: "options osc osc_reqpool_mem_max=POOL_SIZE". The unit of POOL_SIZE is MB. If cl_max_rpcs_in_flight is the same for all OSCs, the memory usage of the OSC pool can be calculated as: Min(POOL_SIZE * 1M, (cl_max_rpcs_in_flight + 2) * OSC number * OST_MAXREQSIZE) Also, this patch changes the allocation logic of OSC write requests. The allocation from osc_rq_pool will only be tried after normal allocation failed. Signed-off-by: Wu Libin Signed-off-by: Wang Shilong Signed-off-by: Li Xi Reviewed-on: http://review.whamcloud.com/15422 Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-6770 Reviewed-by: Jinshan Xiong Reviewed-by: Andreas Dilger Signed-off-by: Oleg Drokin Signed-off-by: Greg Kroah-Hartman --- .../lustre/lustre/include/lustre_import.h | 2 - .../lustre/lustre/include/lustre_net.h | 7 +- .../staging/lustre/lustre/include/obd_class.h | 4 - .../staging/lustre/lustre/obdclass/genops.c | 1 - drivers/staging/lustre/lustre/osc/lproc_osc.c | 17 +++- .../staging/lustre/lustre/osc/osc_internal.h | 4 + .../staging/lustre/lustre/osc/osc_request.c | 79 +++++++++++++++---- drivers/staging/lustre/lustre/ptlrpc/client.c | 21 ++--- 8 files changed, 94 insertions(+), 41 deletions(-) diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h index 5a38f3d5e011..c8b89a359518 100644 --- a/drivers/staging/lustre/lustre/include/lustre_import.h +++ b/drivers/staging/lustre/lustre/include/lustre_import.h @@ -306,8 +306,6 @@ struct obd_import { __u32 imp_msg_magic; __u32 imp_msghdr_flags; /* adjusted based on server capability */ - struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ - struct imp_at imp_at; /* adaptive timeout data */ time_t imp_last_reply_time; /* for health check */ }; diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h index 5df493e90040..313a56c2bfa0 100644 --- a/drivers/staging/lustre/lustre/include/lustre_net.h +++ b/drivers/staging/lustre/lustre/include/lustre_net.h @@ -500,7 +500,7 @@ struct ptlrpc_request_pool { /** Maximum message size that would fit into a request from this pool */ int prp_rq_size; /** Function to allocate more requests for this pool */ - void (*prp_populate)(struct ptlrpc_request_pool *, int); + int (*prp_populate)(struct ptlrpc_request_pool *, int); }; struct lu_context; @@ -2381,11 +2381,11 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, struct ptlrpc_request *req); void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); -void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); struct ptlrpc_request_pool * ptlrpc_init_rq_pool(int, int, - void (*populate_pool)(struct ptlrpc_request_pool *, int)); + int (*populate_pool)(struct ptlrpc_request_pool *, int)); void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, @@ -2957,7 +2957,6 @@ void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops; - /** @} net */ #endif diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h index 87bb2cedca7d..ce6fa55aded7 100644 --- a/drivers/staging/lustre/lustre/include/obd_class.h +++ b/drivers/staging/lustre/lustre/include/obd_class.h @@ -626,10 +626,6 @@ static inline void obd_cleanup_client_import(struct obd_device *obd) CDEBUG(D_CONFIG, "%s: client import never connected\n", obd->obd_name); ptlrpc_invalidate_import(imp); - if (imp->imp_rq_pool) { - ptlrpc_free_rq_pool(imp->imp_rq_pool); - imp->imp_rq_pool = NULL; - } client_destroy_import(imp); obd->u.cli.cl_import = NULL; } diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c index 370d5b4af19b..594955d359ec 100644 --- a/drivers/staging/lustre/lustre/obdclass/genops.c +++ b/drivers/staging/lustre/lustre/obdclass/genops.c @@ -1663,7 +1663,6 @@ static void obd_zombie_export_add(struct obd_export *exp) static void obd_zombie_import_add(struct obd_import *imp) { LASSERT(!imp->imp_sec); - LASSERT(!imp->imp_rq_pool); spin_lock(&obd_zombie_impexp_lock); LASSERT(list_empty(&imp->imp_zombie_chain)); zombies_count++; diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c index ff6d2e2ffdab..c504d1503fda 100644 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -96,9 +96,9 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, struct obd_device *dev = container_of(kobj, struct obd_device, obd_kobj); struct client_obd *cli = &dev->u.cli; - struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; int rc; unsigned long val; + int adding, added, req_count; rc = kstrtoul(buffer, 10, &val); if (rc) @@ -107,8 +107,19 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, if (val < 1 || val > OSC_MAX_RIF_MAX) return -ERANGE; - if (pool && val > cli->cl_max_rpcs_in_flight) - pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); + adding = val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 && req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. + */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } client_obd_list_lock(&cli->cl_loi_list_lock); cli->cl_max_rpcs_in_flight = val; diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h index 470698b0dd75..7d0a3e2db289 100644 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -39,6 +39,10 @@ #define OAP_MAGIC 8675309 +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + struct lu_env; enum async_flags { diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c index 114c5508a0a0..f41f762573f5 100644 --- a/drivers/staging/lustre/lustre/osc/osc_request.c +++ b/drivers/staging/lustre/lustre/osc/osc_request.c @@ -50,9 +50,18 @@ #include "../include/lustre_param.h" #include "../include/lustre_fid.h" #include "../include/obd_class.h" +#include "../include/obd.h" #include "osc_internal.h" #include "osc_cl_internal.h" +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + struct osc_brw_async_args { struct obdo *aa_oa; int aa_requested_nob; @@ -1268,7 +1277,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli, if ((cmd & OBD_BRW_WRITE) != 0) { opc = OST_WRITE; req = ptlrpc_request_alloc_pool(cli->cl_import, - cli->cl_import->imp_rq_pool, + osc_rq_pool, &RQF_OST_BRW_WRITE); } else { opc = OST_READ; @@ -3163,6 +3172,9 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) struct client_obd *cli = &obd->u.cli; void *handler; int rc; + int adding; + int added; + int req_count; rc = ptlrpcd_addref(); if (rc) @@ -3191,15 +3203,20 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_lprocfs_register_obd(obd); } - /* We need to allocate a few requests more, because - * brw_interpret tries to create new requests before freeing - * previous ones, Ideally we want to have 2x max_rpcs_in_flight - * reserved, but I'm afraid that might be too much wasted RAM - * in fact, so 2 is just my guess and still should work. */ - cli->cl_import->imp_rq_pool = - ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, - OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); + /* + * We try to control the total number of requests with a upper limit + * osc_reqpool_maxreqcount. There might be some race which will cause + * over-limit allocation, but it is fine. + */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } INIT_LIST_HEAD(&cli->cl_grant_shrink_list); ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); @@ -3339,6 +3356,8 @@ extern struct lock_class_key osc_ast_guard_class; static int __init osc_init(void) { struct lprocfs_static_vars lvars = { NULL }; + unsigned int reqpool_size; + unsigned int reqsize; int rc; /* print an address of _any_ initialized kernel symbol from this @@ -3354,14 +3373,45 @@ static int __init osc_init(void) rc = class_register_type(&osc_obd_ops, NULL, LUSTRE_OSC_NAME, &osc_device_type); - if (rc) { - lu_kmem_fini(osc_caches); - return rc; - } + if (rc) + goto out_kmem; spin_lock_init(&osc_ast_guard); lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) { + rc = -EINVAL; + goto out_type; + } + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degression in most of cases. + */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool) + return 0; + + rc = -ENOMEM; + +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); return rc; } @@ -3369,6 +3419,7 @@ static void /*__exit*/ osc_exit(void) { class_unregister_type(LUSTRE_OSC_NAME); lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); } MODULE_AUTHOR("Sun Microsystems, Inc. "); diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c index 1800db1e0797..90b24fc584ab 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/client.c +++ b/drivers/staging/lustre/lustre/ptlrpc/client.c @@ -446,7 +446,7 @@ EXPORT_SYMBOL(ptlrpc_free_rq_pool); /** * Allocates, initializes and adds \a num_rq requests to the pool \a pool */ -void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) { int i; int size = 1; @@ -468,11 +468,11 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) spin_unlock(&pool->prp_lock); req = ptlrpc_request_cache_alloc(GFP_NOFS); if (!req) - return; + return i; msg = libcfs_kvzalloc(size, GFP_NOFS); if (!msg) { ptlrpc_request_cache_free(req); - return; + return i; } req->rq_reqbuf = msg; req->rq_reqbuf_len = size; @@ -481,6 +481,7 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) list_add_tail(&req->rq_list, &pool->prp_req_list); } spin_unlock(&pool->prp_lock); + return num_rq; } EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); @@ -494,7 +495,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); */ struct ptlrpc_request_pool * ptlrpc_init_rq_pool(int num_rq, int msgsize, - void (*populate_pool)(struct ptlrpc_request_pool *, int)) + int (*populate_pool)(struct ptlrpc_request_pool *, int)) { struct ptlrpc_request_pool *pool; @@ -512,11 +513,6 @@ ptlrpc_init_rq_pool(int num_rq, int msgsize, populate_pool(pool, num_rq); - if (list_empty(&pool->prp_req_list)) { - /* have not allocated a single request for the pool */ - kfree(pool); - pool = NULL; - } return pool; } EXPORT_SYMBOL(ptlrpc_init_rq_pool); @@ -702,11 +698,10 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, { struct ptlrpc_request *request = NULL; - if (pool) - request = ptlrpc_prep_req_from_pool(pool); + request = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!request) - request = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); if (request) { LASSERTF((unsigned long)imp > 0x1000, "%p", imp); -- 2.20.1