From 528ee9fbf0244406a76cb5e37406eef303b09a46 Mon Sep 17 00:00:00 2001
From: Dean Luick
Date: Sat, 5 Mar 2016 08:50:43 -0800
Subject: [PATCH] IB/hfi1: Add adaptive cacheless verbs copy

The kernel memcpy is faster than a cacheless copy. However, if too much
of the L3 cache is overwritten by one-time copies then overall bandwidth
suffers. Implement an adaptive scheme where full page copies are tracked
and, if the number of unique entries is larger than a threshold, verbs
will use a cacheless copy. Tracked entries are gradually cleaned,
allowing memcpy to resume once the larger copies have stopped.

Reviewed-by: Dennis Dalessandro
Reviewed-by: Mike Marciniszyn
Signed-off-by: Dean Luick
Signed-off-by: Jubin John
Signed-off-by: Doug Ledford
---
 drivers/staging/rdma/hfi1/init.c  |   6 +
 drivers/staging/rdma/hfi1/verbs.c | 185 +++++++++++++++++++++++++++++-
 drivers/staging/rdma/hfi1/verbs.h |  22 ++++
 3 files changed, 211 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
index f21933ca93ce..deabb0812023 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -1242,6 +1242,9 @@ static int __init hfi1_mod_init(void)
 	idr_init(&hfi1_unit_table);
 	hfi1_dbg_init();
+	ret = hfi1_wss_init();
+	if (ret < 0)
+		goto bail_wss;
 	ret = pci_register_driver(&hfi1_pci_driver);
 	if (ret < 0) {
 		pr_err("Unable to register driver: error %d\n", -ret);
@@ -1250,6 +1253,8 @@ static int __init hfi1_mod_init(void)
 	goto bail; /* all OK */
 bail_dev:
+	hfi1_wss_exit();
+bail_wss:
 	hfi1_dbg_exit();
 	idr_destroy(&hfi1_unit_table);
 	dev_cleanup();
@@ -1265,6 +1270,7 @@ module_init(hfi1_mod_init);
 static void __exit hfi1_mod_cleanup(void)
 {
 	pci_unregister_driver(&hfi1_pci_driver);
+	hfi1_wss_exit();
 	hfi1_dbg_exit();
 	hfi1_cpulist_count = 0;
 	kfree(hfi1_cpulist);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 220bdb0b70bc..82097571aa85 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -125,6 +125,13 @@ unsigned short piothreshold;
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+#define COPY_CACHELESS 1
+#define COPY_ADAPTIVE 2
+static unsigned int sge_copy_mode;
+module_param(sge_copy_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(sge_copy_mode,
+		 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
+
 static void verbs_sdma_complete(
 	struct sdma_txreq *cookie,
 	int status);
@@ -137,6 +144,159 @@ static int pio_wait(struct rvt_qp *qp,
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
+static uint wss_threshold;
+module_param(wss_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
+static uint wss_clean_period = 256;
+module_param(wss_clean_period, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
+
+/* memory working set size */
+struct hfi1_wss {
+	unsigned long *entries;
+	atomic_t total_count;
+	atomic_t clean_counter;
+	atomic_t clean_entry;
+
+	int threshold;
+	int num_entries;
+	long pages_mask;
+};
+
+static struct hfi1_wss wss;
+
+int hfi1_wss_init(void)
+{
+	long llc_size;
+	long llc_bits;
+	long table_size;
+	long table_bits;
+
+	/* check for a valid percent range - default to 80 if none or invalid */
+	if (wss_threshold < 1 || wss_threshold > 100)
+		wss_threshold = 80;
+	/* reject a wildly large period */
+	if (wss_clean_period > 1000000)
+		wss_clean_period = 256;
+	/* reject a zero period */
+	if (wss_clean_period == 0)
+		wss_clean_period = 1;
+
+	/*
+	 * Calculate the table size - the next power of 2 larger than the
+	 * LLC size. LLC size is in KiB.
+	 */
+	llc_size = wss_llc_size() * 1024;
+	table_size = roundup_pow_of_two(llc_size);
+
+	/* one bit per page in rounded up table */
+	llc_bits = llc_size / PAGE_SIZE;
+	table_bits = table_size / PAGE_SIZE;
+	wss.pages_mask = table_bits - 1;
+	wss.num_entries = table_bits / BITS_PER_LONG;
+
+	wss.threshold = (llc_bits * wss_threshold) / 100;
+	if (wss.threshold == 0)
+		wss.threshold = 1;
+
+	atomic_set(&wss.clean_counter, wss_clean_period);
+
+	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
+			      GFP_KERNEL);
+	if (!wss.entries) {
+		hfi1_wss_exit();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void hfi1_wss_exit(void)
+{
+	/* coded to handle partially initialized and repeat callers */
+	kfree(wss.entries);
+	wss.entries = NULL;
+}
+
+/*
+ * Advance the clean counter. When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking. Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information. Since this is only a heuristic, this is
+ * OK. Any inaccuracies will clean themselves out as the counter
+ * advances. That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero. When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(void)
+{
+	int entry;
+	int weight;
+	unsigned long bits;
+
+	/* become the cleaner if we decrement the counter to zero */
+	if (atomic_dec_and_test(&wss.clean_counter)) {
+		/*
+		 * Set, not add, the clean period. This avoids an issue
+		 * where the counter could decrement below the clean period.
+		 * Doing a set can result in lost decrements, slowing the
+		 * clean advance. Since this is a heuristic, this possible
+		 * slowdown is OK.
+		 *
+		 * An alternative is to loop, advancing the counter by a
+		 * clean period until the result is > 0. However, this could
+		 * lead to several threads keeping another in the clean loop.
+		 * This could be mitigated by limiting the number of times
+		 * we stay in the loop.
+		 */
+		atomic_set(&wss.clean_counter, wss_clean_period);
+
+		/*
+		 * Uniquely grab the entry to clean and move to next.
+		 * The current entry is always the lower bits of
+		 * wss.clean_entry. The table size, wss.num_entries,
+		 * is always a power-of-2.
+		 */
+		entry = (atomic_inc_return(&wss.clean_entry) - 1)
+			& (wss.num_entries - 1);
+
+		/* clear the entry and count the bits */
+		bits = xchg(&wss.entries[entry], 0);
+		weight = hweight64((u64)bits);
+		/* only adjust the contended total count if needed */
+		if (weight)
+			atomic_sub(weight, &wss.total_count);
+	}
+}
+
+/*
+ * Insert the given address into the working set array.
+ */
+static void wss_insert(void *address)
+{
+	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
+	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+	u32 nr = page & (BITS_PER_LONG - 1);
+
+	if (!test_and_set_bit(nr, &wss.entries[entry]))
+		atomic_inc(&wss.total_count);
+
+	wss_advance_clean_counter();
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline int wss_exceeds_threshold(void)
+{
+	return atomic_read(&wss.total_count) >= wss.threshold;
+}
+
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
@@ -258,7 +418,26 @@ void hfi1_copy_sge(
 	struct rvt_sge *sge = &ss->sge;
 	int in_last = 0;
 	int i;
+	int cacheless_copy = 0;
+
+	if (sge_copy_mode == COPY_CACHELESS) {
+		cacheless_copy = length >= PAGE_SIZE;
+	} else if (sge_copy_mode == COPY_ADAPTIVE) {
+		if (length >= PAGE_SIZE) {
+			/*
+			 * NOTE: this *assumes*:
+			 * o The first vaddr is the dest.
+			 * o If multiple pages, then vaddr is sequential.
+			 */
+			wss_insert(sge->vaddr);
+			if (length >= (2 * PAGE_SIZE))
+				wss_insert(sge->vaddr + PAGE_SIZE);
+
+			cacheless_copy = wss_exceeds_threshold();
+		} else {
+			wss_advance_clean_counter();
+		}
+	}
 	if (copy_last) {
 		if (length > 8) {
 			length -= 8;
@@ -277,10 +456,12 @@ again:
 		if (len > sge->sge_length)
 			len = sge->sge_length;
 		WARN_ON_ONCE(len == 0);
-		if (in_last) {
-			/* enforce byte transer ordering */
+		if (unlikely(in_last)) {
+			/* enforce byte transfer ordering */
 			for (i = 0; i < len; i++)
 				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+		} else if (cacheless_copy) {
+			cacheless_memcpy(sge->vaddr, data, len);
 		} else {
 			memcpy(sge->vaddr, data, len);
 		}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index a85e6bc580b6..6c4670fffdbb 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -475,6 +475,28 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc);
+int hfi1_wss_init(void);
+void hfi1_wss_exit(void);
+
+/* platform specific: return the lowest level cache (llc) size, in KiB */
+static inline int wss_llc_size(void)
+{
+	/* assume that the boot CPU value is universal for all CPUs */
+	return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static inline void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+	/*
+	 * Use the only available X64 cacheless copy. Add a __user cast
+	 * to quiet sparse. The src argument is already in the kernel so
+	 * there are no security issues. The extra fault recovery machinery
+	 * is not invoked.
+	 */
+	__copy_user_nocache(dst, (void __user *)src, n, 0);
+}
+
 extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
 extern const u8 hdr_len_by_opcode[];
-- 
2.20.1
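
For reference, below is a minimal, single-threaded userspace sketch of the adaptive working-set heuristic the patch adds to verbs.c. It is illustrative only and not part of the patch: the kernel code keeps its counters in atomics and clears table words with xchg() so concurrent callers stay lock-free, and its cacheless path is __copy_user_nocache(); here those are replaced with plain integers and an ordinary memcpy stand-in. The names adaptive_copy(), LLC_KIB, CLEAN_PERIOD, THRESHOLD_PCT and the buffer sizes in main() are made-up values chosen only for the demo.

/*
 * Userspace sketch of the hfi1 adaptive copy heuristic (illustrative).
 * Single-threaded: plain integers stand in for the kernel's atomics,
 * and both copy paths below are ordinary memcpy.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define LLC_KIB		30720		/* pretend 30 MiB last level cache */
#define CLEAN_PERIOD	256		/* copies between cleaning steps */
#define THRESHOLD_PCT	80		/* percent of LLC pages */

#define TABLE_PAGES	(1UL << 13)	/* next power of 2 >= LLC pages */
#define NUM_ENTRIES	(TABLE_PAGES / BITS_PER_LONG)

static unsigned long entries[NUM_ENTRIES];	/* one bit per tracked page */
static long total_count;			/* pages currently marked */
static long clean_counter = CLEAN_PERIOD;
static unsigned long clean_entry;		/* next table word to clean */
static const long threshold =
	((LLC_KIB * 1024L / PAGE_SIZE) * THRESHOLD_PCT) / 100;

/* decrement the clean counter; clear one table word per clean period */
static void wss_advance_clean_counter(void)
{
	if (--clean_counter == 0) {
		unsigned long entry = clean_entry++ & (NUM_ENTRIES - 1);

		clean_counter = CLEAN_PERIOD;
		total_count -= __builtin_popcountl(entries[entry]);
		entries[entry] = 0;
	}
}

/* mark the page containing address as recently copied */
static void wss_insert(const void *address)
{
	unsigned long page = ((uintptr_t)address >> PAGE_SHIFT) &
			     (TABLE_PAGES - 1);
	unsigned long entry = page / BITS_PER_LONG;
	unsigned long bit = 1UL << (page % BITS_PER_LONG);

	if (!(entries[entry] & bit)) {
		entries[entry] |= bit;
		total_count++;
	}
	wss_advance_clean_counter();
}

/* choose a copy strategy the same way hfi1_copy_sge decides */
static void adaptive_copy(void *dst, const void *src, size_t len)
{
	int cacheless = 0;

	if (len >= PAGE_SIZE) {
		wss_insert(dst);
		if (len >= 2 * PAGE_SIZE)
			wss_insert((const char *)dst + PAGE_SIZE);
		cacheless = total_count >= threshold;
	} else {
		wss_advance_clean_counter();
	}

	/* userspace stand-in: both branches are plain memcpy here */
	if (cacheless)
		memcpy(dst, src, len);	/* kernel: cacheless_memcpy() */
	else
		memcpy(dst, src, len);	/* kernel: memcpy() */
}

int main(void)
{
	static char dst[2 * PAGE_SIZE], src[2 * PAGE_SIZE];
	int i;

	for (i = 0; i < 100000; i++)
		adaptive_copy(dst, src, sizeof(dst));
	printf("pages currently tracked %ld, cacheless threshold %ld pages\n",
	       total_count, threshold);
	return 0;
}

The shape matches the kernel logic: every large copy marks one bit per destination page, one table word is cleared every CLEAN_PERIOD copies, and the cacheless path is chosen only while the number of marked pages exceeds the LLC-derived threshold, so ordinary memcpy resumes once a burst of large copies stops.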