Various cache line optimizations:
- Move delay_work towards the end. It's huge, and we don't use it
a lot (only SCSI).
- Move the atomic state into the same cacheline as the dispatch
  list and lock.
- Rearrange a few members to pack it better.
- Shrink the max-order for dispatch accounting from 10 to 7. This
means that ->dispatched[] and ->run now take up their own
cacheline.
This shrinks struct blk_mq_hw_ctx down to 8 cachelines.
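
For reference, the dispatch accounting sizing works out as follows, assuming
64-byte cache lines and an 8-byte unsigned long; this is just a sketch and
the struct name is illustrative, not the real layout:

	#define CACHELINE_BYTES           64
	#define BLK_MQ_MAX_DISPATCH_ORDER 7

	/* ->run plus dispatched[7] is 8 + 7 * 8 = 64 bytes, exactly one
	 * cache line; with the old order of 10 the array alone was 80
	 * bytes and spilled into a second line. */
	struct dispatch_stats_sketch {
		unsigned long run;
		unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
	};

	_Static_assert(sizeof(struct dispatch_stats_sketch) == CACHELINE_BYTES,
		       "run + dispatched[] should fill one cache line");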
Signed-off-by: Jens Axboe <axboe@fb.com>
struct {
spinlock_t lock;
struct list_head dispatch;
+ unsigned long state; /* BLK_MQ_S_* flags */
} ____cacheline_aligned_in_smp;
- unsigned long state; /* BLK_MQ_S_* flags */
struct work_struct run_work;
- struct delayed_work delay_work;
cpumask_var_t cpumask;
int next_cpu;
int next_cpu_batch;
struct blk_mq_ctxmap ctx_map;
- unsigned int nr_ctx;
struct blk_mq_ctx **ctxs;
+ unsigned int nr_ctx;
atomic_t wait_index;
unsigned long queued;
unsigned long run;
-#define BLK_MQ_MAX_DISPATCH_ORDER 10
+#define BLK_MQ_MAX_DISPATCH_ORDER 7
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
unsigned int numa_node;
atomic_t nr_active;
+ struct delayed_work delay_work;
+
struct blk_mq_cpu_notifier cpu_notifier;
struct kobject kobj;
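
A rough standalone illustration of the ____cacheline_aligned_in_smp grouping
used above, with the kernel macro approximated by a plain GCC alignment
attribute and illustrative names (not the real blk_mq_hw_ctx):

	#define SKETCH_CACHELINE 64
	#define sketch_cacheline_aligned __attribute__((aligned(SKETCH_CACHELINE)))

	struct sketch_list_head {
		struct sketch_list_head *next, *prev;
	};

	struct hctx_sketch {
		/* Fields touched together on the dispatch path share one
		 * aligned unit, so loading the lock also pulls in the
		 * dispatch list head and the state flags. */
		struct {
			int lock;			/* stand-in for spinlock_t */
			struct sketch_list_head dispatch;
			unsigned long state;		/* BLK_MQ_S_* style flags */
		} sketch_cacheline_aligned;

		/* Colder bookkeeping starts on the next cache line. */
		unsigned long queued;
		unsigned long run;
	};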