block: improve struct request_queue layout

It's clearly been a while since someone looked at this, so I gave it a
quick shot. There are few issues in here:

- Random bundling of members that are mostly read-only and often written
- Random holes that need not be there

This moves the most frequently used bits into cacheline 1 and 2, with
the 2nd one being more write intensive than the first one, which is
basically read-only.

Outside of making this work a bit more efficiently, it also reduces the
size of struct request_queue for my test setup from 864 bytes (spanning
14 cachelines!) to 832 bytes and 13 cachelines.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/d2b7b61c-4868-45c0-9060-4f9c73de9d7e@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2023-12-14 11:08:15 -07:00
parent 6ef02df154
commit 0c734c5ea7

View File

@ -367,59 +367,51 @@ struct blk_independent_access_ranges {
};
struct request_queue {
struct request *last_merge;
struct elevator_queue *elevator;
struct percpu_ref q_usage_counter;
struct blk_queue_stats *stats;
struct rq_qos *rq_qos;
struct mutex rq_qos_mutex;
const struct blk_mq_ops *mq_ops;
/* sw queues */
struct blk_mq_ctx __percpu *queue_ctx;
unsigned int queue_depth;
/* hw dispatch queues */
struct xarray hctx_table;
unsigned int nr_hw_queues;
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
void *queuedata;
struct elevator_queue *elevator;
const struct blk_mq_ops *mq_ops;
/* sw queues */
struct blk_mq_ctx __percpu *queue_ctx;
/*
* various queue flags, see QUEUE_* below
*/
unsigned long queue_flags;
/*
* Number of contexts that have called blk_set_pm_only(). If this
* counter is above zero then only RQF_PM requests are processed.
*/
atomic_t pm_only;
/*
* ida allocated id for this queue. Used to index queues from
* ioctx.
*/
int id;
unsigned int rq_timeout;
unsigned int queue_depth;
refcount_t refs;
/* hw dispatch queues */
unsigned int nr_hw_queues;
struct xarray hctx_table;
struct percpu_ref q_usage_counter;
struct request *last_merge;
spinlock_t queue_lock;
struct gendisk *disk;
int quiesce_depth;
refcount_t refs;
struct gendisk *disk;
/*
* mq queue kobject
*/
struct kobject *mq_kobj;
struct queue_limits limits;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity integrity;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@ -429,25 +421,41 @@ struct request_queue {
enum rpm_status rpm_status;
#endif
/*
* Number of contexts that have called blk_set_pm_only(). If this
* counter is above zero then only RQF_PM requests are processed.
*/
atomic_t pm_only;
struct blk_queue_stats *stats;
struct rq_qos *rq_qos;
struct mutex rq_qos_mutex;
/*
* ida allocated id for this queue. Used to index queues from
* ioctx.
*/
int id;
unsigned int dma_pad_mask;
/*
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
unsigned int dma_pad_mask;
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct blk_crypto_profile *crypto_profile;
struct kobject *crypto_kobject;
#endif
unsigned int rq_timeout;
struct timer_list timeout;
struct work_struct timeout_work;
atomic_t nr_active_requests_shared_tags;
unsigned int required_elevator_features;
struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
@ -458,11 +466,12 @@ struct request_queue {
struct mutex blkcg_mutex;
#endif
struct queue_limits limits;
unsigned int required_elevator_features;
int node;
spinlock_t requeue_lock;
struct list_head requeue_list;
struct delayed_work requeue_work;
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace __rcu *blk_trace;
#endif
@ -472,10 +481,6 @@ struct request_queue {
struct blk_flush_queue *fq;
struct list_head flush_list;
struct list_head requeue_list;
spinlock_t requeue_lock;
struct delayed_work requeue_work;
struct mutex sysfs_lock;
struct mutex sysfs_dir_lock;
@ -500,8 +505,6 @@ struct request_queue {
*/
struct mutex mq_freeze_lock;
int quiesce_depth;
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;