From c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Sep 2008 20:26:01 +0200 Subject: [PATCH] block: add support for IO CPU affinity This patch adds support for controlling the IO completion CPU of either all requests on a queue, or on a per-request basis. We export a sysfs variable (rq_affinity) which, if set, migrates completions of requests to the CPU that originally submitted it. A bio helper (bio_set_completion_cpu()) is also added, so that queuers can ask for completion on that specific CPU. In testing, this has been show to cut the system time by as much as 20-40% on synthetic workloads where CPU affinity is desired. This requires a little help from the architecture, so it'll only work as designed for archs that are using the new generic smp helper infrastructure. Signed-off-by: Jens Axboe --- block/blk-core.c | 46 ++++++++--------- block/blk-settings.c | 2 +- block/blk-softirq.c | 126 +++++++++++++++++++++++++++++++++++------------ block/blk-sysfs.c | 31 ++++++++++++ block/blk.h | 12 +++++ fs/bio.c | 1 + include/linux/bio.h | 11 +++++ include/linux/blkdev.h | 5 +- include/linux/elevator.h | 8 +-- 9 files changed, 182 insertions(+), 60 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9c6f818..5484838 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -110,7 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); - INIT_LIST_HEAD(&rq->donelist); + rq->cpu = -1; rq->q = q; rq->sector = rq->hard_sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); @@ -322,6 +322,21 @@ void blk_unplug(struct request_queue *q) } EXPORT_SYMBOL(blk_unplug); +static void blk_invoke_request_fn(struct request_queue *q) +{ + /* + * one level of recursion is ok and is much faster than kicking + * the unplug handling + */ + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { + q->request_fn(q); + queue_flag_clear(QUEUE_FLAG_REENTER, q); + } else { + queue_flag_set(QUEUE_FLAG_PLUGGED, q); + kblockd_schedule_work(q, &q->unplug_work); + } +} + /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question @@ -336,18 +351,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - - /* - * one level of recursion is ok and is much faster than kicking - * the unplug handling - */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - blk_plug_device(q); - kblockd_schedule_work(q, &q->unplug_work); - } + blk_invoke_request_fn(q); } EXPORT_SYMBOL(blk_start_queue); @@ -405,15 +409,8 @@ void __blk_run_queue(struct request_queue *q) * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. */ - if (!elv_queue_empty(q)) { - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - blk_plug_device(q); - kblockd_schedule_work(q, &q->unplug_work); - } - } + if (!elv_queue_empty(q)) + blk_invoke_request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); @@ -1056,6 +1053,7 @@ EXPORT_SYMBOL(blk_put_request); void init_request_from_bio(struct request *req, struct bio *bio) { + req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; /* @@ -1198,13 +1196,15 @@ get_rq: init_request_from_bio(req, bio); spin_lock_irq(q->queue_lock); + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = blk_cpu_to_group(smp_processor_id()); if (elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: if (sync) __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); return 0; diff --git a/block/blk-settings.c b/block/blk-settings.c index d70692b..a60e959 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -static int __init blk_settings_init(void) +int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; blk_max_pfn = max_pfn - 1; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 9e1c43b..3a1af55 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -13,6 +13,70 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +static void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = &__get_cpu_var(blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, csd.list); + list_del_init(&rq->csd.list); + rq->q->softirq_done_fn(rq); + } +} + +#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +static void trigger_softirq(void *data) +{ + struct request *rq = data; + unsigned long flags; + struct list_head *list; + + local_irq_save(flags); + list = &__get_cpu_var(blk_cpu_done); + list_add_tail(&rq->csd.list, list); + + if (list->next == &rq->csd.list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + + local_irq_restore(flags); +} + +/* + * Setup and invoke a run of 'trigger_softirq' on the given cpu. + */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + if (cpu_online(cpu)) { + struct call_single_data *data = &rq->csd; + + data->func = trigger_softirq; + data->info = rq; + data->flags = 0; + + __smp_call_function_single(cpu, data); + return 0; + } + + return 1; +} +#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + return 1; +} +#endif + static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } - -static struct notifier_block blk_cpu_notifier __cpuinitdata = { +static struct notifier_block __cpuinitdata blk_cpu_notifier = { .notifier_call = blk_cpu_notify, }; -/* - * splice the completion data to a local structure and hand off to - * process_completion_queue() to complete the requests - */ -static void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, donelist); - list_del_init(&rq->donelist); - rq->q->softirq_done_fn(rq); - } -} - /** * blk_complete_request - end I/O on a request * @req: the request being processed @@ -71,25 +112,48 @@ static void blk_done_softirq(struct softirq_action *h) * through a softirq handler. The user must have registered a completion * callback through blk_queue_softirq_done(). **/ - void blk_complete_request(struct request *req) { - struct list_head *cpu_list; + struct request_queue *q = req->q; unsigned long flags; + int ccpu, cpu, group_cpu; - BUG_ON(!req->q->softirq_done_fn); + BUG_ON(!q->softirq_done_fn); local_irq_save(flags); + cpu = smp_processor_id(); + group_cpu = blk_cpu_to_group(cpu); - cpu_list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&req->donelist, cpu_list); - raise_softirq_irqoff(BLOCK_SOFTIRQ); + /* + * Select completion CPU + */ + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + ccpu = req->cpu; + else + ccpu = cpu; + + if (ccpu == cpu || ccpu == group_cpu) { + struct list_head *list; +do_local: + list = &__get_cpu_var(blk_cpu_done); + list_add_tail(&req->csd.list, list); + + /* + * if the list only contains our just added request, + * signal a raise of the softirq. If there are already + * entries there, someone already raised the irq but it + * hasn't run yet. + */ + if (list->next == &req->csd.list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + } else if (raise_blk_irq(ccpu, req)) + goto do_local; local_irq_restore(flags); } EXPORT_SYMBOL(blk_complete_request); -int __init blk_softirq_init(void) +__init int blk_softirq_init(void) { int i; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b9a6ed1..21e275d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) +{ + unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + + return queue_var_show(set != 0, page); +} + +static ssize_t +queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) +{ + ssize_t ret = -EINVAL; +#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) + unsigned long val; + + ret = queue_var_store(&val, page, count); + spin_lock_irq(q->queue_lock); + if (val) + queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + else + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + spin_unlock_irq(q->queue_lock); +#endif + return ret; +} static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, @@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = { .store = queue_nomerges_store, }; +static struct queue_sysfs_entry queue_rq_affinity_entry = { + .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR }, + .show = queue_rq_affinity_show, + .store = queue_rq_affinity_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = { &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_nomerges_entry.attr, + &queue_rq_affinity_entry.attr, NULL, }; diff --git a/block/blk.h b/block/blk.h index c79f30e..de74254 100644 --- a/block/blk.h +++ b/block/blk.h @@ -59,4 +59,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) #endif /* BLK_DEV_INTEGRITY */ +static inline int blk_cpu_to_group(int cpu) +{ +#ifdef CONFIG_SCHED_MC + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) + return first_cpu(per_cpu(cpu_sibling_map, cpu)); +#else + return cpu; +#endif +} + #endif diff --git a/fs/bio.c b/fs/bio.c index bee4dec..6a637b5 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -111,6 +111,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c0c090..13aba20 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -81,6 +81,8 @@ struct bio { unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ + unsigned int bi_comp_cpu; /* completion CPU */ + struct bio_vec *bi_io_vec; /* the actual vec list */ bio_end_io_t *bi_end_io; @@ -105,6 +107,7 @@ struct bio { #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ +#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -343,6 +346,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern unsigned int bvec_nr_vecs(unsigned short idx); /* + * Allow queuer to specify a completion CPU for this bio + */ +static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) +{ + bio->bi_comp_cpu = cpu; +} + +/* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. * These memory pools in turn all allocate from the bio_slab diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 10aa46c..93204bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -139,7 +140,8 @@ enum rq_flag_bits { */ struct request { struct list_head queuelist; - struct list_head donelist; + struct call_single_data csd; + int cpu; struct request_queue *q; @@ -420,6 +422,7 @@ struct request_queue #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ static inline int queue_is_locked(struct request_queue *q) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 639624b..bb791c3 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -173,15 +173,15 @@ enum { #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) /* - * Hack to reuse the donelist list_head as the fifo time holder while + * Hack to reuse the csd.list list_head as the fifo time holder while * the request is in the io scheduler. Saves an unsigned long in rq. */ -#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next) -#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp)) +#define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next) +#define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp)) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) do { \ list_del_init(&(rq)->queuelist); \ - INIT_LIST_HEAD(&(rq)->donelist); \ + INIT_LIST_HEAD(&(rq)->csd.list); \ } while (0) /* -- 1.8.2.3