From: Jeff Moyer <jmoyer@redhat.com> Date: Tue, 3 Nov 2009 11:36:36 -0500 Subject: [block] cfq-iosched: add close cooperator code Message-id: 1257266206-24003-3-git-send-email-jmoyer@redhat.com O-Subject: [PATCH 02/12] cfq-iosched: add close cooperator code Bugzilla: 456181 448130 427709 RH-Acked-by: Rik van Riel <riel@redhat.com> RH-Acked-by: Josef Bacik <josef@redhat.com> RH-Acked-by: Vivek Goyal <vgoyal@redhat.com> RH-Acked-by: Jeff Layton <jlayton@redhat.com> This is a backport of commit a36e71f996e25d6213f57951f7ae1874086ec57e: If we have processes that are working in close proximity to each other on disk, we don't want to idle wait. Instead allow the close process to issue a request, getting better aggregate bandwidth. The anticipatory scheduler has similar checks, noop and deadline do not need it since they don't care about process <-> io mappings. The code for CFQ is a little more involved though, since we split request queues into per-process contexts. This fixes a performance problem with eg dump(8), since it uses several processes in some silly attempt to speed IO up. Even if dump(8) isn't really a valid case (it should be fixed by using CLONE_IO), there are other cases where we see close processes and where idling ends up hurting performance. Credit goes to Jeff Moyer <jmoyer@redhat.com> for writing the initial implementation. Signed-off-by: Jens Axboe <jens.axboe@oracle.com> Signed-off-by: Jeff Moyer <jmoyer@redhat.com> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f7c069e..3772eea 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -100,6 +100,14 @@ struct cfq_data { struct list_head busy_rr; struct list_head cur_rr; struct list_head idle_rr; + + /* + * Each priority tree is sorted by next_request position. These + * trees are used when determining if two or more queues are + * interleaving requests (see cfq_close_cooperator). + */ + struct rb_root prio_trees[CFQ_PRIO_LISTS]; + unsigned int busy_queues; /* @@ -172,6 +180,8 @@ struct cfq_queue { unsigned int key; /* on either rr or empty list of cfqd */ struct list_head cfq_list; + /* prio tree member */ + struct rb_node p_node; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ @@ -220,6 +230,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_fifo_expire, CFQ_CFQQ_FLAG_idle_window, CFQ_CFQQ_FLAG_prio_changed, + CFQ_CFQQ_FLAG_coop, }; #define CFQ_CFQQ_FNS(name) \ @@ -244,6 +255,7 @@ CFQ_CFQQ_FNS(must_dispatch); CFQ_CFQQ_FNS(fifo_expire); CFQ_CFQQ_FNS(idle_window); CFQ_CFQQ_FNS(prio_changed); +CFQ_CFQQ_FNS(coop); #undef CFQ_CFQQ_FNS enum cfq_rq_state_flags { @@ -457,6 +469,71 @@ static void cfq_update_next_crq(struct cfq_rq *crq) cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); } + +static struct cfq_queue * +cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, + struct rb_node **ret_parent, struct rb_node ***rb_link) +{ + struct rb_root *root = &cfqd->prio_trees[ioprio]; + struct rb_node **p, *parent; + struct cfq_queue *cfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + cfqq = rb_entry(parent, struct cfq_queue, p_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > cfqq->next_crq->request->sector) + n = &(*p)->rb_right; + else if (sector < cfqq->next_crq->request->sector) + n = &(*p)->rb_left; + else + break; + p = n; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio]; + struct rb_node **p, *parent; + struct cfq_queue *__cfqq; + + if (!RB_EMPTY_NODE(&cfqq->p_node)) + rb_erase_init(&cfqq->p_node, root); + + if (cfq_class_idle(cfqq)) + return; + if (!cfqq->next_crq) + return; + + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, + cfqq->next_crq->request->sector, + &parent, &p); + BUG_ON(__cfqq); + + rb_link_node(&cfqq->p_node, parent, p); + rb_insert_color(&cfqq->p_node, root); +} + static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { struct cfq_data *cfqd = cfqq->cfqd; @@ -510,6 +587,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) } list_add(&cfqq->cfq_list, entry); + cfq_prio_tree_add(cfqd, cfqq); } /* @@ -532,6 +610,8 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); list_move(&cfqq->cfq_list, &cfqd->empty_list); + if (!RB_EMPTY_NODE(&cfqq->p_node)) + rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -585,7 +665,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) struct cfq_queue *cfqq = crq->cfq_queue; struct cfq_data *cfqd = cfqq->cfqd; struct request *rq = crq->request; - struct cfq_rq *__alias; + struct cfq_rq *__alias, *prev; crq->rb_key = rq_rb_key(rq); cfqq->queued[cfq_crq_is_sync(crq)]++; @@ -605,7 +685,14 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) /* * check if this request is a better next-serve candidate */ + prev = cfqq->next_crq; cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); + + /* + * adjust priority tree position, if ->next_crq changes + */ + if (prev != cfqq->next_crq) + cfq_prio_tree_add(cfqd, cfqq); } static inline void @@ -875,9 +962,23 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - rq->sector; } -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) +static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) { - struct cfq_queue *cfqq = NULL; + struct cfq_io_context *cic = cfqd->active_cic; + + if (!sample_valid(cic->seek_samples)) + return 0; + + return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; +} + +#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) + +static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + if (cfqq) + goto set_queue; /* * if current list is non-empty, grab first entry. if it is empty, @@ -907,21 +1008,97 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) mod_timer(&cfqd->idle_class_timer, end); } + if (cfqq) + cfq_clear_cfqq_coop(cfqq); + +set_queue: __cfq_set_active_queue(cfqd, cfqq); return cfqq; } -static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) +static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, + struct cfq_queue *cur_cfqq) { - struct cfq_io_context *cic = cfqd->active_cic; + struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio]; + struct rb_node *parent, *node; + struct cfq_queue *__cfqq; + sector_t sector = cfqd->last_position; - if (!sample_valid(cic->seek_samples)) - return 0; + if (RB_EMPTY_ROOT(root)) + return NULL; - return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio, + sector, &parent, NULL); + if (__cfqq) + return __cfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector. + */ + __cfqq = rb_entry(parent, struct cfq_queue, p_node); + if (cfq_rq_close(cfqd, __cfqq->next_crq->request)) + return __cfqq; + + if (__cfqq->next_crq->request->sector < sector) + node = rb_next(&__cfqq->p_node); + else + node = rb_prev(&__cfqq->p_node); + if (!node) + return NULL; + + __cfqq = rb_entry(node, struct cfq_queue, p_node); + if (cfq_rq_close(cfqd, __cfqq->next_crq->request)) + return __cfqq; + + return NULL; } -#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) +/* + * cfqd - obvious + * cur_cfqq - passed in so that we don't decide that the current queue is + * closely cooperating with itself. + * + * So, basically we're assuming that the cur_cfqq has dispatched at least + * one request, and that cfqd->last_position reflects a position on the disk + * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid + * assumption. + */ +static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, + struct cfq_queue *cur_cfqq, + int probe) +{ + struct cfq_queue *cfqq; + + /* + * A valid cfq_io_context is necessary to compare requests against + * the seek_mean of the current cfqq. + */ + if (!cfqd->active_cic) + return NULL; + + /* + * We should notice if some of the queues are cooperating, eg + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + cfqq = cfqq_close(cfqd, cur_cfqq); + if (!cfqq) + return NULL; + + if (cfq_cfqq_coop(cfqq)) + return NULL; + + if (!probe) + cfq_mark_cfqq_coop(cfqq); + return cfqq; +} + + static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -1040,7 +1217,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) { unsigned long now = jiffies; - struct cfq_queue *cfqq; + struct cfq_queue *cfqq, *new_cfqq = NULL; cfqq = cfqd->active_queue; if (!cfqq) @@ -1058,6 +1235,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto keep_queue; + /* + * If another queue has a request waiting within our mean seek + * distance, let it run. The expire code will check for close + * cooperators and put the close queue at the front of the service + * tree. + */ + else if ((new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0))) + goto expire; else if (cfq_cfqq_dispatched(cfqq)) { cfqq = NULL; goto keep_queue; @@ -1069,7 +1254,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) expire: cfq_slice_expired(cfqd, 0); new_queue: - cfqq = cfq_set_active_queue(cfqd); + cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; } @@ -1495,6 +1680,7 @@ retry: INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); INIT_LIST_HEAD(&cfqq->fifo); + RB_CLEAR_NODE(&cfqq->p_node); cfqq->key = key; hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); @@ -1887,9 +2073,12 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) * or if we want to idle in case it has no pending requests. */ if (cfqd->active_queue == cfqq) { + const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); + if (time_after(now, cfqq->slice_end)) cfq_slice_expired(cfqd, 0); - else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) + else if (cfqq_empty && + !cfq_close_cooperator(cfqd, cfqq, 1) && sync) cfq_arm_slice_timer(cfqd, cfqq); }