[PATCH] deal with rmmod/put_io_context() races
block/cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8  */
9 #include <linux/kernel.h>
10 #include <linux/fs.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/bio.h>
14 #include <linux/config.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/init.h>
18 #include <linux/compiler.h>
19 #include <linux/hash.h>
20 #include <linux/rbtree.h>
21 #include <linux/mempool.h>
22 #include <linux/ioprio.h>
23 #include <linux/writeback.h>
24
25 /*
26  * tunables
27  */
28 static const int cfq_quantum = 4;               /* max queue in one round of service */
29 static const int cfq_queued = 8;                /* minimum rq allocate limit per-queue*/
30 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
31 static const int cfq_back_max = 16 * 1024;      /* maximum backwards seek, in KiB */
32 static const int cfq_back_penalty = 2;          /* penalty of a backwards seek */
33
34 static const int cfq_slice_sync = HZ / 10;
35 static int cfq_slice_async = HZ / 25;
36 static const int cfq_slice_async_rq = 2;
37 static int cfq_slice_idle = HZ / 100;
38
39 #define CFQ_IDLE_GRACE          (HZ / 10)
40 #define CFQ_SLICE_SCALE         (5)
41
42 #define CFQ_KEY_ASYNC           (0)
43 #define CFQ_KEY_ANY             (0xffff)
44
45 /*
46  * disable queueing at the driver/hardware level
47  */
48 static const int cfq_max_depth = 2;
49
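/*
 * Guards the per-process cfq_io_context lists and cfqd->cic_list against
 * teardown (the rmmod/put_io_context() races this patch addresses): taken
 * for read while linking cics and while walking them at exit time, and for
 * write in the ioprio-change path.
 */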
50 static DEFINE_RWLOCK(cfq_exit_lock);
51
52 /*
53  * for the hash of cfqq inside the cfqd
54  */
55 #define CFQ_QHASH_SHIFT         6
56 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
57 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
58
59 /*
60  * for the hash of crq inside the cfqq
61  */
62 #define CFQ_MHASH_SHIFT         6
63 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
64 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
65 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
66 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
67 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
68
69 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
70 #define list_entry_fifo(ptr)    list_entry((ptr), struct request, queuelist)
71
72 #define RQ_DATA(rq)             (rq)->elevator_private
73
74 /*
75  * rb-tree defines
76  */
77 #define RB_NONE                 (2)
78 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
79 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
80 #define RB_CLEAR(node)          do {    \
81         (node)->rb_parent = NULL;       \
82         RB_CLEAR_COLOR((node));         \
83         (node)->rb_right = NULL;        \
84         (node)->rb_left = NULL;         \
85 } while (0)
86 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
87 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
88 #define rq_rb_key(rq)           (rq)->sector
89
90 static kmem_cache_t *crq_pool;
91 static kmem_cache_t *cfq_pool;
92 static kmem_cache_t *cfq_ioc_pool;
93
94 static atomic_t ioc_count = ATOMIC_INIT(0);
95 static struct completion *ioc_gone;
96
97 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
98 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
99 #define cfq_class_be(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
100 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
101
102 #define ASYNC                   (0)
103 #define SYNC                    (1)
104
105 #define cfq_cfqq_dispatched(cfqq)       \
106         ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
107
108 #define cfq_cfqq_class_sync(cfqq)       ((cfqq)->key != CFQ_KEY_ASYNC)
109
110 #define cfq_cfqq_sync(cfqq)             \
111         (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
112
113 /*
114  * Per block device queue structure
115  */
116 struct cfq_data {
117         atomic_t ref;
118         request_queue_t *queue;
119
120         /*
121          * rr list of queues with requests and the count of them
122          */
123         struct list_head rr_list[CFQ_PRIO_LISTS];
124         struct list_head busy_rr;
125         struct list_head cur_rr;
126         struct list_head idle_rr;
127         unsigned int busy_queues;
128
129         /*
130          * non-ordered list of empty cfqq's
131          */
132         struct list_head empty_list;
133
134         /*
135          * cfqq lookup hash
136          */
137         struct hlist_head *cfq_hash;
138
139         /*
140          * global crq hash for all queues
141          */
142         struct hlist_head *crq_hash;
143
144         unsigned int max_queued;
145
146         mempool_t *crq_pool;
147
148         int rq_in_driver;
149
150         /*
151          * schedule slice state info
152          */
153         /*
154          * idle window management
155          */
156         struct timer_list idle_slice_timer;
157         struct work_struct unplug_work;
158
159         struct cfq_queue *active_queue;
160         struct cfq_io_context *active_cic;
161         int cur_prio, cur_end_prio;
162         unsigned int dispatch_slice;
163
164         struct timer_list idle_class_timer;
165
166         sector_t last_sector;
167         unsigned long last_end_request;
168
169         unsigned int rq_starved;
170
171         /*
172          * tunables, see top of file
173          */
174         unsigned int cfq_quantum;
175         unsigned int cfq_queued;
176         unsigned int cfq_fifo_expire[2];
177         unsigned int cfq_back_penalty;
178         unsigned int cfq_back_max;
179         unsigned int cfq_slice[2];
180         unsigned int cfq_slice_async_rq;
181         unsigned int cfq_slice_idle;
182         unsigned int cfq_max_depth;
183
184         struct list_head cic_list;
185 };
186
187 /*
188  * Per process-grouping structure
189  */
190 struct cfq_queue {
191         /* reference count */
192         atomic_t ref;
193         /* parent cfq_data */
194         struct cfq_data *cfqd;
195         /* cfqq lookup hash */
196         struct hlist_node cfq_hash;
197         /* hash key */
198         unsigned int key;
199         /* on either rr or empty list of cfqd */
200         struct list_head cfq_list;
201         /* sorted list of pending requests */
202         struct rb_root sort_list;
203         /* if fifo isn't expired, next request to serve */
204         struct cfq_rq *next_crq;
205         /* requests queued in sort_list */
206         int queued[2];
207         /* currently allocated requests */
208         int allocated[2];
209         /* fifo list of requests in sort_list */
210         struct list_head fifo;
211
212         unsigned long slice_start;
213         unsigned long slice_end;
214         unsigned long slice_left;
215         unsigned long service_last;
216
217         /* number of requests that are on the dispatch list */
218         int on_dispatch[2];
219
220         /* io prio of this group */
221         unsigned short ioprio, org_ioprio;
222         unsigned short ioprio_class, org_ioprio_class;
223
224         /* various state flags, see below */
225         unsigned int flags;
226 };
227
228 struct cfq_rq {
229         struct rb_node rb_node;
230         sector_t rb_key;
231         struct request *request;
232         struct hlist_node hash;
233
234         struct cfq_queue *cfq_queue;
235         struct cfq_io_context *io_context;
236
237         unsigned int crq_flags;
238 };
239
240 enum cfqq_state_flags {
241         CFQ_CFQQ_FLAG_on_rr = 0,
242         CFQ_CFQQ_FLAG_wait_request,
243         CFQ_CFQQ_FLAG_must_alloc,
244         CFQ_CFQQ_FLAG_must_alloc_slice,
245         CFQ_CFQQ_FLAG_must_dispatch,
246         CFQ_CFQQ_FLAG_fifo_expire,
247         CFQ_CFQQ_FLAG_idle_window,
248         CFQ_CFQQ_FLAG_prio_changed,
249 };
250
251 #define CFQ_CFQQ_FNS(name)                                              \
252 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
253 {                                                                       \
254         cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);                     \
255 }                                                                       \
256 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
257 {                                                                       \
258         cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                    \
259 }                                                                       \
260 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
261 {                                                                       \
262         return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;        \
263 }
264
265 CFQ_CFQQ_FNS(on_rr);
266 CFQ_CFQQ_FNS(wait_request);
267 CFQ_CFQQ_FNS(must_alloc);
268 CFQ_CFQQ_FNS(must_alloc_slice);
269 CFQ_CFQQ_FNS(must_dispatch);
270 CFQ_CFQQ_FNS(fifo_expire);
271 CFQ_CFQQ_FNS(idle_window);
272 CFQ_CFQQ_FNS(prio_changed);
273 #undef CFQ_CFQQ_FNS
274
275 enum cfq_rq_state_flags {
276         CFQ_CRQ_FLAG_is_sync = 0,
277 };
278
279 #define CFQ_CRQ_FNS(name)                                               \
280 static inline void cfq_mark_crq_##name(struct cfq_rq *crq)              \
281 {                                                                       \
282         crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);                   \
283 }                                                                       \
284 static inline void cfq_clear_crq_##name(struct cfq_rq *crq)             \
285 {                                                                       \
286         crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);                  \
287 }                                                                       \
288 static inline int cfq_crq_##name(const struct cfq_rq *crq)              \
289 {                                                                       \
290         return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;      \
291 }
292
293 CFQ_CRQ_FNS(is_sync);
294 #undef CFQ_CRQ_FNS
295
296 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
297 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
298 static void cfq_put_cfqd(struct cfq_data *cfqd);
299
300 #define process_sync(tsk)       ((tsk)->flags & PF_SYNCWRITE)
301
302 /*
303  * lots of deadline iosched dupes, can be abstracted later...
304  */
305 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
306 {
307         hlist_del_init(&crq->hash);
308 }
309
310 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
311 {
312         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
313
314         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
315 }
316
317 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
318 {
319         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
320         struct hlist_node *entry, *next;
321
322         hlist_for_each_safe(entry, next, hash_list) {
323                 struct cfq_rq *crq = list_entry_hash(entry);
324                 struct request *__rq = crq->request;
325
326                 if (!rq_mergeable(__rq)) {
327                         cfq_del_crq_hash(crq);
328                         continue;
329                 }
330
331                 if (rq_hash_key(__rq) == offset)
332                         return __rq;
333         }
334
335         return NULL;
336 }
337
338 /*
339  * schedule a run of the queue if there are requests pending and no one in
340  * the driver that will restart queueing
341  */
342 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
343 {
344         if (cfqd->busy_queues)
345                 kblockd_schedule_work(&cfqd->unplug_work);
346 }
347
348 static int cfq_queue_empty(request_queue_t *q)
349 {
350         struct cfq_data *cfqd = q->elevator->elevator_data;
351
352         return !cfqd->busy_queues;
353 }
354
355 /*
356  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
357  * We choose the request that is closest to the head right now. Distances
358  * behind the head are penalized and only allowed to a certain extent.
359  */
360 static struct cfq_rq *
361 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
362 {
363         sector_t last, s1, s2, d1 = 0, d2 = 0;
364         int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
365         unsigned long back_max;
366
367         if (crq1 == NULL || crq1 == crq2)
368                 return crq2;
369         if (crq2 == NULL)
370                 return crq1;
371
372         if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
373                 return crq1;
374         else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
375                 return crq2;
376
377         s1 = crq1->request->sector;
378         s2 = crq2->request->sector;
379
380         last = cfqd->last_sector;
381
382         /*
383          * by definition, 1KiB is 2 sectors
384          */
385         back_max = cfqd->cfq_back_max * 2;
386
387         /*
388          * Strict one way elevator _except_ in the case where we allow
389          * short backward seeks which are biased as twice the cost of a
390          * similar forward seek.
391          */
392         if (s1 >= last)
393                 d1 = s1 - last;
394         else if (s1 + back_max >= last)
395                 d1 = (last - s1) * cfqd->cfq_back_penalty;
396         else
397                 r1_wrap = 1;
398
399         if (s2 >= last)
400                 d2 = s2 - last;
401         else if (s2 + back_max >= last)
402                 d2 = (last - s2) * cfqd->cfq_back_penalty;
403         else
404                 r2_wrap = 1;
405
406         /* Found required data */
407         if (!r1_wrap && r2_wrap)
408                 return crq1;
409         else if (!r2_wrap && r1_wrap)
410                 return crq2;
411         else if (r1_wrap && r2_wrap) {
412                 /* both behind the head */
413                 if (s1 <= s2)
414                         return crq1;
415                 else
416                         return crq2;
417         }
418
419         /* Both requests in front of the head */
420         if (d1 < d2)
421                 return crq1;
422         else if (d2 < d1)
423                 return crq2;
424         else {
425                 if (s1 >= s2)
426                         return crq1;
427                 else
428                         return crq2;
429         }
430 }
431
432 /*
433  * would be nice to take fifo expire time into account as well
434  */
435 static struct cfq_rq *
436 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
437                   struct cfq_rq *last)
438 {
439         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
440         struct rb_node *rbnext, *rbprev;
441
442         if (!(rbnext = rb_next(&last->rb_node))) {
443                 rbnext = rb_first(&cfqq->sort_list);
444                 if (rbnext == &last->rb_node)
445                         rbnext = NULL;
446         }
447
448         rbprev = rb_prev(&last->rb_node);
449
450         if (rbprev)
451                 crq_prev = rb_entry_crq(rbprev);
452         if (rbnext)
453                 crq_next = rb_entry_crq(rbnext);
454
455         return cfq_choose_req(cfqd, crq_next, crq_prev);
456 }
457
458 static void cfq_update_next_crq(struct cfq_rq *crq)
459 {
460         struct cfq_queue *cfqq = crq->cfq_queue;
461
462         if (cfqq->next_crq == crq)
463                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
464 }
465
466 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
467 {
468         struct cfq_data *cfqd = cfqq->cfqd;
469         struct list_head *list, *entry;
470
471         BUG_ON(!cfq_cfqq_on_rr(cfqq));
472
473         list_del(&cfqq->cfq_list);
474
475         if (cfq_class_rt(cfqq))
476                 list = &cfqd->cur_rr;
477         else if (cfq_class_idle(cfqq))
478                 list = &cfqd->idle_rr;
479         else {
480                 /*
481                  * if cfqq has requests in flight, don't allow it to be
482                  * found in cfq_set_active_queue before it has finished them.
483                  * this is done to increase fairness between a process that
484                  * has lots of io pending vs one that only issues requests
485                  * sporadically or synchronously
486                  */
487                 if (cfq_cfqq_dispatched(cfqq))
488                         list = &cfqd->busy_rr;
489                 else
490                         list = &cfqd->rr_list[cfqq->ioprio];
491         }
492
493         /*
494          * if queue was preempted, just add to front to be fair. busy_rr
495          * isn't sorted.
496          */
497         if (preempted || list == &cfqd->busy_rr) {
498                 list_add(&cfqq->cfq_list, list);
499                 return;
500         }
501
502         /*
503          * sort by when queue was last serviced
504          */
505         entry = list;
506         while ((entry = entry->prev) != list) {
507                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
508
509                 if (!__cfqq->service_last)
510                         break;
511                 if (time_before(__cfqq->service_last, cfqq->service_last))
512                         break;
513         }
514
515         list_add(&cfqq->cfq_list, entry);
516 }
517
518 /*
519  * add to busy list of queues for service, trying to be fair in ordering
520  * the pending list according to last request service
521  */
522 static inline void
523 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
524 {
525         BUG_ON(cfq_cfqq_on_rr(cfqq));
526         cfq_mark_cfqq_on_rr(cfqq);
527         cfqd->busy_queues++;
528
529         cfq_resort_rr_list(cfqq, 0);
530 }
531
532 static inline void
533 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
534 {
535         BUG_ON(!cfq_cfqq_on_rr(cfqq));
536         cfq_clear_cfqq_on_rr(cfqq);
537         list_move(&cfqq->cfq_list, &cfqd->empty_list);
538
539         BUG_ON(!cfqd->busy_queues);
540         cfqd->busy_queues--;
541 }
542
543 /*
544  * rb tree support functions
545  */
546 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
547 {
548         struct cfq_queue *cfqq = crq->cfq_queue;
549         struct cfq_data *cfqd = cfqq->cfqd;
550         const int sync = cfq_crq_is_sync(crq);
551
552         BUG_ON(!cfqq->queued[sync]);
553         cfqq->queued[sync]--;
554
555         cfq_update_next_crq(crq);
556
557         rb_erase(&crq->rb_node, &cfqq->sort_list);
558         RB_CLEAR_COLOR(&crq->rb_node);
559
560         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
561                 cfq_del_cfqq_rr(cfqd, cfqq);
562 }
563
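/*
 * Insert crq into its queue's sort_list, keyed by rb_key (start sector).
 * If a request with the same key already exists, nothing is linked and the
 * existing alias is returned so the caller can dispatch it directly.
 */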
564 static struct cfq_rq *
565 __cfq_add_crq_rb(struct cfq_rq *crq)
566 {
567         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
568         struct rb_node *parent = NULL;
569         struct cfq_rq *__crq;
570
571         while (*p) {
572                 parent = *p;
573                 __crq = rb_entry_crq(parent);
574
575                 if (crq->rb_key < __crq->rb_key)
576                         p = &(*p)->rb_left;
577                 else if (crq->rb_key > __crq->rb_key)
578                         p = &(*p)->rb_right;
579                 else
580                         return __crq;
581         }
582
583         rb_link_node(&crq->rb_node, parent, p);
584         return NULL;
585 }
586
587 static void cfq_add_crq_rb(struct cfq_rq *crq)
588 {
589         struct cfq_queue *cfqq = crq->cfq_queue;
590         struct cfq_data *cfqd = cfqq->cfqd;
591         struct request *rq = crq->request;
592         struct cfq_rq *__alias;
593
594         crq->rb_key = rq_rb_key(rq);
595         cfqq->queued[cfq_crq_is_sync(crq)]++;
596
597         /*
598          * looks a little odd, but the first insert might return an alias.
599          * if that happens, put the alias on the dispatch list
600          */
601         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
602                 cfq_dispatch_insert(cfqd->queue, __alias);
603
604         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
605
606         if (!cfq_cfqq_on_rr(cfqq))
607                 cfq_add_cfqq_rr(cfqd, cfqq);
608
609         /*
610          * check if this request is a better next-serve candidate
611          */
612         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
613 }
614
615 static inline void
616 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
617 {
618         rb_erase(&crq->rb_node, &cfqq->sort_list);
619         cfqq->queued[cfq_crq_is_sync(crq)]--;
620
621         cfq_add_crq_rb(crq);
622 }
623
624 static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
626 {
627         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
628         struct rb_node *n;
629
630         if (!cfqq)
631                 goto out;
632
633         n = cfqq->sort_list.rb_node;
634         while (n) {
635                 struct cfq_rq *crq = rb_entry_crq(n);
636
637                 if (sector < crq->rb_key)
638                         n = n->rb_left;
639                 else if (sector > crq->rb_key)
640                         n = n->rb_right;
641                 else
642                         return crq->request;
643         }
644
645 out:
646         return NULL;
647 }
648
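/*
 * rq_in_driver counts requests handed to the driver; it is checked against
 * cfq_max_depth in cfq_dispatch_requests() to limit hardware queue buildup.
 */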
649 static void cfq_activate_request(request_queue_t *q, struct request *rq)
650 {
651         struct cfq_data *cfqd = q->elevator->elevator_data;
652
653         cfqd->rq_in_driver++;
654 }
655
656 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
657 {
658         struct cfq_data *cfqd = q->elevator->elevator_data;
659
660         WARN_ON(!cfqd->rq_in_driver);
661         cfqd->rq_in_driver--;
662 }
663
664 static void cfq_remove_request(struct request *rq)
665 {
666         struct cfq_rq *crq = RQ_DATA(rq);
667
668         list_del_init(&rq->queuelist);
669         cfq_del_crq_rb(crq);
670         cfq_del_crq_hash(crq);
671 }
672
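/*
 * See if bio can be back-merged (via the crq hash, keyed by end sector) or
 * front-merged (via the per-queue rb-tree, keyed by start sector) into an
 * existing request.
 */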
673 static int
674 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
675 {
676         struct cfq_data *cfqd = q->elevator->elevator_data;
677         struct request *__rq;
678         int ret;
679
680         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
681         if (__rq && elv_rq_merge_ok(__rq, bio)) {
682                 ret = ELEVATOR_BACK_MERGE;
683                 goto out;
684         }
685
686         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
687         if (__rq && elv_rq_merge_ok(__rq, bio)) {
688                 ret = ELEVATOR_FRONT_MERGE;
689                 goto out;
690         }
691
692         return ELEVATOR_NO_MERGE;
693 out:
694         *req = __rq;
695         return ret;
696 }
697
698 static void cfq_merged_request(request_queue_t *q, struct request *req)
699 {
700         struct cfq_data *cfqd = q->elevator->elevator_data;
701         struct cfq_rq *crq = RQ_DATA(req);
702
703         cfq_del_crq_hash(crq);
704         cfq_add_crq_hash(cfqd, crq);
705
706         if (rq_rb_key(req) != crq->rb_key) {
707                 struct cfq_queue *cfqq = crq->cfq_queue;
708
709                 cfq_update_next_crq(crq);
710                 cfq_reposition_crq_rb(cfqq, crq);
711         }
712 }
713
714 static void
715 cfq_merged_requests(request_queue_t *q, struct request *rq,
716                     struct request *next)
717 {
718         cfq_merged_request(q, rq);
719
720         /*
721          * reposition in fifo if next is older than rq
722          */
723         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
724             time_before(next->start_time, rq->start_time))
725                 list_move(&rq->queuelist, &next->queuelist);
726
727         cfq_remove_request(next);
728 }
729
730 static inline void
731 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
732 {
733         if (cfqq) {
734                 /*
735                  * stop potential idle class queues waiting service
736                  */
737                 del_timer(&cfqd->idle_class_timer);
738
739                 cfqq->slice_start = jiffies;
740                 cfqq->slice_end = 0;
741                 cfqq->slice_left = 0;
742                 cfq_clear_cfqq_must_alloc_slice(cfqq);
743                 cfq_clear_cfqq_fifo_expire(cfqq);
744         }
745
746         cfqd->active_queue = cfqq;
747 }
748
749 /*
750  * current cfqq expired its slice (or was too idle), select new one
751  */
752 static void
753 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
754                     int preempted)
755 {
756         unsigned long now = jiffies;
757
758         if (cfq_cfqq_wait_request(cfqq))
759                 del_timer(&cfqd->idle_slice_timer);
760
761         if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
762                 cfqq->service_last = now;
763                 cfq_schedule_dispatch(cfqd);
764         }
765
766         cfq_clear_cfqq_must_dispatch(cfqq);
767         cfq_clear_cfqq_wait_request(cfqq);
768
769         /*
770          * store what was left of this slice, if the queue idled out
771          * or was preempted
772          */
773         if (time_after(cfqq->slice_end, now))
774                 cfqq->slice_left = cfqq->slice_end - now;
775         else
776                 cfqq->slice_left = 0;
777
778         if (cfq_cfqq_on_rr(cfqq))
779                 cfq_resort_rr_list(cfqq, preempted);
780
781         if (cfqq == cfqd->active_queue)
782                 cfqd->active_queue = NULL;
783
784         if (cfqd->active_cic) {
785                 put_io_context(cfqd->active_cic->ioc);
786                 cfqd->active_cic = NULL;
787         }
788
789         cfqd->dispatch_slice = 0;
790 }
791
792 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
793 {
794         struct cfq_queue *cfqq = cfqd->active_queue;
795
796         if (cfqq)
797                 __cfq_slice_expired(cfqd, cfqq, preempted);
798 }
799
800 /*
 * Priority scan pattern: each round widens the range by one level, so the
 * lower-numbered (higher priority) lists are visited more often:
 *
801  * 0
802  * 0,1
803  * 0,1,2
804  * 0,1,2,3
805  * 0,1,2,3,4
806  * 0,1,2,3,4,5
807  * 0,1,2,3,4,5,6
808  * 0,1,2,3,4,5,6,7
809  */
810 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
811 {
812         int prio, wrap;
813
814         prio = -1;
815         wrap = 0;
816         do {
817                 int p;
818
819                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
820                         if (!list_empty(&cfqd->rr_list[p])) {
821                                 prio = p;
822                                 break;
823                         }
824                 }
825
826                 if (prio != -1)
827                         break;
828                 cfqd->cur_prio = 0;
829                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
830                         cfqd->cur_end_prio = 0;
831                         if (wrap)
832                                 break;
833                         wrap = 1;
834                 }
835         } while (1);
836
837         if (unlikely(prio == -1))
838                 return -1;
839
840         BUG_ON(prio >= CFQ_PRIO_LISTS);
841
842         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
843
844         cfqd->cur_prio = prio + 1;
845         if (cfqd->cur_prio > cfqd->cur_end_prio) {
846                 cfqd->cur_end_prio = cfqd->cur_prio;
847                 cfqd->cur_prio = 0;
848         }
849         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
850                 cfqd->cur_prio = 0;
851                 cfqd->cur_end_prio = 0;
852         }
853
854         return prio;
855 }
856
857 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
858 {
859         struct cfq_queue *cfqq = NULL;
860
861         /*
862          * if the current list is non-empty, grab its first entry. if it is empty,
863          * refill it from the next prio level and grab the first spliced entry, if any
864          */
865         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
866                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
867
868         /*
869          * if we have idle queues and no rt or be queues had pending
870          * requests, either allow immediate service if the grace period
871          * has passed or arm the idle grace timer
872          */
873         if (!cfqq && !list_empty(&cfqd->idle_rr)) {
874                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
875
876                 if (time_after_eq(jiffies, end))
877                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
878                 else
879                         mod_timer(&cfqd->idle_class_timer, end);
880         }
881
882         __cfq_set_active_queue(cfqd, cfqq);
883         return cfqq;
884 }
885
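/*
 * Arm the idle timer for the active (sync, idling-enabled) queue when its
 * sort_list is empty, in the hope that a new request arrives before the
 * slice runs out. Returns 1 if we decided to wait, 0 otherwise.
 */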
886 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
888 {
889         unsigned long sl;
890
891         WARN_ON(!RB_EMPTY(&cfqq->sort_list));
892         WARN_ON(cfqq != cfqd->active_queue);
893
894         /*
895          * idle is disabled, either manually or by past process history
896          */
897         if (!cfqd->cfq_slice_idle)
898                 return 0;
899         if (!cfq_cfqq_idle_window(cfqq))
900                 return 0;
901         /*
902          * task has exited, don't wait
903          */
904         if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
905                 return 0;
906
907         cfq_mark_cfqq_must_dispatch(cfqq);
908         cfq_mark_cfqq_wait_request(cfqq);
909
910         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
911         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
912         return 1;
913 }
914
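/*
 * Move a request from the cfq scheduler lists onto the device dispatch
 * list, accounting it as in flight for its cfqq.
 */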
915 static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
916 {
917         struct cfq_data *cfqd = q->elevator->elevator_data;
918         struct cfq_queue *cfqq = crq->cfq_queue;
919
920         cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
921         cfq_remove_request(crq->request);
922         cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
923         elv_dispatch_sort(q, crq->request);
924 }
925
926 /*
927  * return expired entry, or NULL to just start from scratch in rbtree
928  */
929 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
930 {
931         struct cfq_data *cfqd = cfqq->cfqd;
932         struct request *rq;
933         struct cfq_rq *crq;
934
935         if (cfq_cfqq_fifo_expire(cfqq))
936                 return NULL;
937
938         if (!list_empty(&cfqq->fifo)) {
939                 int fifo = cfq_cfqq_class_sync(cfqq);
940
941                 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
942                 rq = crq->request;
943                 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
944                         cfq_mark_cfqq_fifo_expire(cfqq);
945                         return crq;
946                 }
947         }
948
949         return NULL;
950 }
951
952 /*
953  * Scale schedule slice based on io priority. Use the sync time slice only
954  * if a queue is marked sync and has sync io queued. A sync queue with only
955  * async io should not get the full sync slice length.
956  */
957 static inline int
958 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
959 {
960         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
961
962         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
963
964         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
965 }
966
967 static inline void
968 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
969 {
970         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
971 }
972
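/*
 * Maximum number of requests an async queue may dispatch in one slice,
 * scaled by priority: numerically lower (more important) ioprio values
 * allow proportionally more requests.
 */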
973 static inline int
974 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
975 {
976         const int base_rq = cfqd->cfq_slice_async_rq;
977
978         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
979
980         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
981 }
982
983 /*
984  * get next queue for service
985  */
986 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
987 {
988         unsigned long now = jiffies;
989         struct cfq_queue *cfqq;
990
991         cfqq = cfqd->active_queue;
992         if (!cfqq)
993                 goto new_queue;
994
995         /*
996          * slice has expired
997          */
998         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
999                 goto expire;
1000
1001         /*
1002          * if queue has requests, dispatch one. if not, check if
1003          * enough slice is left to wait for one
1004          */
1005         if (!RB_EMPTY(&cfqq->sort_list))
1006                 goto keep_queue;
1007         else if (cfq_cfqq_class_sync(cfqq) &&
1008                  time_before(now, cfqq->slice_end)) {
1009                 if (cfq_arm_slice_timer(cfqd, cfqq))
1010                         return NULL;
1011         }
1012
1013 expire:
1014         cfq_slice_expired(cfqd, 0);
1015 new_queue:
1016         cfqq = cfq_set_active_queue(cfqd);
1017 keep_queue:
1018         return cfqq;
1019 }
1020
1021 static int
1022 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1023                         int max_dispatch)
1024 {
1025         int dispatched = 0;
1026
1027         BUG_ON(RB_EMPTY(&cfqq->sort_list));
1028
1029         do {
1030                 struct cfq_rq *crq;
1031
1032                 /*
1033                  * follow the expired path, else pick the next best available crq
1034                  */
1035                 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1036                         crq = cfqq->next_crq;
1037
1038                 /*
1039                  * finally, insert request into driver dispatch list
1040                  */
1041                 cfq_dispatch_insert(cfqd->queue, crq);
1042
1043                 cfqd->dispatch_slice++;
1044                 dispatched++;
1045
1046                 if (!cfqd->active_cic) {
1047                         atomic_inc(&crq->io_context->ioc->refcount);
1048                         cfqd->active_cic = crq->io_context;
1049                 }
1050
1051                 if (RB_EMPTY(&cfqq->sort_list))
1052                         break;
1053
1054         } while (dispatched < max_dispatch);
1055
1056         /*
1057          * if slice end isn't set yet, set it. if at least one request was
1058          * sync, use the sync time slice value
1059          */
1060         if (!cfqq->slice_end)
1061                 cfq_set_prio_slice(cfqd, cfqq);
1062
1063         /*
1064                  * expire an async queue immediately if it has used up its slice. idle
1065                  * queues always expire after one dispatch round.
1066          */
1067         if ((!cfq_cfqq_sync(cfqq) &&
1068             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1069             cfq_class_idle(cfqq))
1070                 cfq_slice_expired(cfqd, 0);
1071
1072         return dispatched;
1073 }
1074
1075 static int
1076 cfq_forced_dispatch_cfqqs(struct list_head *list)
1077 {
1078         int dispatched = 0;
1079         struct cfq_queue *cfqq, *next;
1080         struct cfq_rq *crq;
1081
1082         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1083                 while ((crq = cfqq->next_crq)) {
1084                         cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1085                         dispatched++;
1086                 }
1087                 BUG_ON(!list_empty(&cfqq->fifo));
1088         }
1089         return dispatched;
1090 }
1091
1092 static int
1093 cfq_forced_dispatch(struct cfq_data *cfqd)
1094 {
1095         int i, dispatched = 0;
1096
1097         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1098                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1099
1100         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1101         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1102         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1103
1104         cfq_slice_expired(cfqd, 0);
1105
1106         BUG_ON(cfqd->busy_queues);
1107
1108         return dispatched;
1109 }
1110
1111 static int
1112 cfq_dispatch_requests(request_queue_t *q, int force)
1113 {
1114         struct cfq_data *cfqd = q->elevator->elevator_data;
1115         struct cfq_queue *cfqq;
1116
1117         if (!cfqd->busy_queues)
1118                 return 0;
1119
1120         if (unlikely(force))
1121                 return cfq_forced_dispatch(cfqd);
1122
1123         cfqq = cfq_select_queue(cfqd);
1124         if (cfqq) {
1125                 int max_dispatch;
1126
1127                 /*
1128                  * if idle window is disabled, allow queue buildup
1129                  */
1130                 if (!cfq_cfqq_idle_window(cfqq) &&
1131                     cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1132                         return 0;
1133
1134                 cfq_clear_cfqq_must_dispatch(cfqq);
1135                 cfq_clear_cfqq_wait_request(cfqq);
1136                 del_timer(&cfqd->idle_slice_timer);
1137
1138                 max_dispatch = cfqd->cfq_quantum;
1139                 if (cfq_class_idle(cfqq))
1140                         max_dispatch = 1;
1141
1142                 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1143         }
1144
1145         return 0;
1146 }
1147
1148 /*
1149  * task holds one reference to the queue, dropped when task exits. each crq
1150  * in-flight on this queue also holds a reference, dropped when crq is freed.
1151  *
1152  * queue lock must be held here.
1153  */
1154 static void cfq_put_queue(struct cfq_queue *cfqq)
1155 {
1156         struct cfq_data *cfqd = cfqq->cfqd;
1157
1158         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1159
1160         if (!atomic_dec_and_test(&cfqq->ref))
1161                 return;
1162
1163         BUG_ON(rb_first(&cfqq->sort_list));
1164         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1165         BUG_ON(cfq_cfqq_on_rr(cfqq));
1166
1167         if (unlikely(cfqd->active_queue == cfqq))
1168                 __cfq_slice_expired(cfqd, cfqq, 0);
1169
1170         cfq_put_cfqd(cfqq->cfqd);
1171
1172         /*
1173          * it's on the empty list and still hashed
1174          */
1175         list_del(&cfqq->cfq_list);
1176         hlist_del(&cfqq->cfq_hash);
1177         kmem_cache_free(cfq_pool, cfqq);
1178 }
1179
1180 static inline struct cfq_queue *
1181 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1182                     const int hashval)
1183 {
1184         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1185         struct hlist_node *entry, *next;
1186
1187         hlist_for_each_safe(entry, next, hash_list) {
1188                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1189                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1190
1191                 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1192                         return __cfqq;
1193         }
1194
1195         return NULL;
1196 }
1197
1198 static struct cfq_queue *
1199 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1200 {
1201         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1202 }
1203
1204 static void cfq_free_io_context(struct cfq_io_context *cic)
1205 {
1206         struct cfq_io_context *__cic;
1207         struct list_head *entry, *next;
1208         int freed = 1;
1209
1210         list_for_each_safe(entry, next, &cic->list) {
1211                 __cic = list_entry(entry, struct cfq_io_context, list);
1212                 kmem_cache_free(cfq_ioc_pool, __cic);
1213                 freed++;
1214         }
1215
1216         kmem_cache_free(cfq_ioc_pool, cic);
1217         if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1218                 complete(ioc_gone);
1219 }
1220
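/*
 * Detach an io_context from cfq: clear the ioprio hook and free the
 * attached cfq_io_contexts. Presumably invoked when the io_context itself
 * is being torn down.
 */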
1221 static void cfq_trim(struct io_context *ioc)
1222 {
1223         ioc->set_ioprio = NULL;
1224         if (ioc->cic)
1225                 cfq_free_io_context(ioc->cic);
1226 }
1227
1228 /*
1229  * Called with interrupts disabled
1230  */
1231 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1232 {
1233         struct cfq_data *cfqd = cic->key;
1234         request_queue_t *q;
1235
1236         if (!cfqd)
1237                 return;
1238
1239         q = cfqd->queue;
1240
1241         WARN_ON(!irqs_disabled());
1242
1243         spin_lock(q->queue_lock);
1244
1245         if (cic->cfqq[ASYNC]) {
1246                 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1247                         __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1248                 cfq_put_queue(cic->cfqq[ASYNC]);
1249                 cic->cfqq[ASYNC] = NULL;
1250         }
1251
1252         if (cic->cfqq[SYNC]) {
1253                 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1254                         __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1255                 cfq_put_queue(cic->cfqq[SYNC]);
1256                 cic->cfqq[SYNC] = NULL;
1257         }
1258
1259         cic->key = NULL;
1260         list_del_init(&cic->queue_list);
1261         spin_unlock(q->queue_lock);
1262 }
1263
1264 /*
1265  * Another task may update the task cic list, if it is doing a queue lookup
1266  * on its behalf. cfq_exit_lock excludes such concurrent updates
1267  */
1268 static void cfq_exit_io_context(struct cfq_io_context *cic)
1269 {
1270         struct cfq_io_context *__cic;
1271         struct list_head *entry;
1272         unsigned long flags;
1273
1274         local_irq_save(flags);
1275
1276         /*
1277          * put the reference this task is holding to the various queues
1278          */
1279         read_lock(&cfq_exit_lock);
1280         list_for_each(entry, &cic->list) {
1281                 __cic = list_entry(entry, struct cfq_io_context, list);
1282                 cfq_exit_single_io_context(__cic);
1283         }
1284
1285         cfq_exit_single_io_context(cic);
1286         read_unlock(&cfq_exit_lock);
1287         local_irq_restore(flags);
1288 }
1289
1290 static struct cfq_io_context *
1291 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1292 {
1293         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1294
1295         if (cic) {
1296                 INIT_LIST_HEAD(&cic->list);
1297                 cic->cfqq[ASYNC] = NULL;
1298                 cic->cfqq[SYNC] = NULL;
1299                 cic->key = NULL;
1300                 cic->last_end_request = jiffies;
1301                 cic->ttime_total = 0;
1302                 cic->ttime_samples = 0;
1303                 cic->ttime_mean = 0;
1304                 cic->dtor = cfq_free_io_context;
1305                 cic->exit = cfq_exit_io_context;
1306                 INIT_LIST_HEAD(&cic->queue_list);
1307                 atomic_inc(&ioc_count);
1308         }
1309
1310         return cic;
1311 }
1312
1313 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1314 {
1315         struct task_struct *tsk = current;
1316         int ioprio_class;
1317
1318         if (!cfq_cfqq_prio_changed(cfqq))
1319                 return;
1320
1321         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1322         switch (ioprio_class) {
1323                 default:
1324                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1325                 case IOPRIO_CLASS_NONE:
1326                         /*
1327                          * no prio set, place us in the middle of the BE classes
1328                          */
1329                         cfqq->ioprio = task_nice_ioprio(tsk);
1330                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1331                         break;
1332                 case IOPRIO_CLASS_RT:
1333                         cfqq->ioprio = task_ioprio(tsk);
1334                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1335                         break;
1336                 case IOPRIO_CLASS_BE:
1337                         cfqq->ioprio = task_ioprio(tsk);
1338                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1339                         break;
1340                 case IOPRIO_CLASS_IDLE:
1341                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1342                         cfqq->ioprio = 7;
1343                         cfq_clear_cfqq_idle_window(cfqq);
1344                         break;
1345         }
1346
1347         /*
1348          * keep track of original prio settings in case we have to temporarily
1349          * elevate the priority of this queue
1350          */
1351         cfqq->org_ioprio = cfqq->ioprio;
1352         cfqq->org_ioprio_class = cfqq->ioprio_class;
1353
1354         if (cfq_cfqq_on_rr(cfqq))
1355                 cfq_resort_rr_list(cfqq, 0);
1356
1357         cfq_clear_cfqq_prio_changed(cfqq);
1358 }
1359
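/*
 * Propagate an ioprio change to the sync and async cfqqs attached to this
 * cic by re-marking them prio_changed and re-running cfq_init_prio_data().
 */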
1360 static inline void changed_ioprio(struct cfq_io_context *cic)
1361 {
1362         struct cfq_data *cfqd = cic->key;
1363         struct cfq_queue *cfqq;
1364         if (cfqd) {
1365                 spin_lock(cfqd->queue->queue_lock);
1366                 cfqq = cic->cfqq[ASYNC];
1367                 if (cfqq) {
1368                         cfq_mark_cfqq_prio_changed(cfqq);
1369                         cfq_init_prio_data(cfqq);
1370                 }
1371                 cfqq = cic->cfqq[SYNC];
1372                 if (cfqq) {
1373                         cfq_mark_cfqq_prio_changed(cfqq);
1374                         cfq_init_prio_data(cfqq);
1375                 }
1376                 spin_unlock(cfqd->queue->queue_lock);
1377         }
1378 }
1379
1380 /*
1381  * callback from sys_ioprio_set, irqs are disabled
1382  */
1383 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1384 {
1385         struct cfq_io_context *cic;
1386
1387         write_lock(&cfq_exit_lock);
1388
1389         cic = ioc->cic;
1390
1391         changed_ioprio(cic);
1392
1393         list_for_each_entry(cic, &cic->list, list)
1394                 changed_ioprio(cic);
1395
1396         write_unlock(&cfq_exit_lock);
1397
1398         return 0;
1399 }
1400
1401 static struct cfq_queue *
1402 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
1403               gfp_t gfp_mask)
1404 {
1405         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1406         struct cfq_queue *cfqq, *new_cfqq = NULL;
1407
1408 retry:
1409         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1410
1411         if (!cfqq) {
1412                 if (new_cfqq) {
1413                         cfqq = new_cfqq;
1414                         new_cfqq = NULL;
1415                 } else if (gfp_mask & __GFP_WAIT) {
1416                         spin_unlock_irq(cfqd->queue->queue_lock);
1417                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1418                         spin_lock_irq(cfqd->queue->queue_lock);
1419                         goto retry;
1420                 } else {
1421                         cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1422                         if (!cfqq)
1423                                 goto out;
1424                 }
1425
1426                 memset(cfqq, 0, sizeof(*cfqq));
1427
1428                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1429                 INIT_LIST_HEAD(&cfqq->cfq_list);
1430                 RB_CLEAR_ROOT(&cfqq->sort_list);
1431                 INIT_LIST_HEAD(&cfqq->fifo);
1432
1433                 cfqq->key = key;
1434                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1435                 atomic_set(&cfqq->ref, 0);
1436                 cfqq->cfqd = cfqd;
1437                 atomic_inc(&cfqd->ref);
1438                 cfqq->service_last = 0;
1439                 /*
1440                  * set ->slice_left to allow preemption for a new process
1441                  */
1442                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1443                 cfq_mark_cfqq_idle_window(cfqq);
1444                 cfq_mark_cfqq_prio_changed(cfqq);
1445                 cfq_init_prio_data(cfqq);
1446         }
1447
1448         if (new_cfqq)
1449                 kmem_cache_free(cfq_pool, new_cfqq);
1450
1451         atomic_inc(&cfqq->ref);
1452 out:
1453         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1454         return cfqq;
1455 }
1456
1457 /*
1458  * Set up the general io context and cfq io context. There can be several cfq
1459  * io contexts per general io context, if this process is doing io to more
1460  * than one device managed by cfq. Note that caller is holding a reference to
1461  * cfqq, so we don't need to worry about it disappearing
1462  */
1463 static struct cfq_io_context *
1464 cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1465 {
1466         struct io_context *ioc = NULL;
1467         struct cfq_io_context *cic;
1468
1469         might_sleep_if(gfp_mask & __GFP_WAIT);
1470
1471         ioc = get_io_context(gfp_mask);
1472         if (!ioc)
1473                 return NULL;
1474
1475 restart:
1476         if ((cic = ioc->cic) == NULL) {
1477                 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1478
1479                 if (cic == NULL)
1480                         goto err;
1481
1482                 /*
1483                  * manually increment generic io_context usage count, it
1484                  * cannot go away since we are already holding one ref to it
1485                  */
1486                 cic->ioc = ioc;
1487                 cic->key = cfqd;
1488                 read_lock(&cfq_exit_lock);
1489                 ioc->set_ioprio = cfq_ioc_set_ioprio;
1490                 ioc->cic = cic;
1491                 list_add(&cic->queue_list, &cfqd->cic_list);
1492                 read_unlock(&cfq_exit_lock);
1493         } else {
1494                 struct cfq_io_context *__cic;
1495
1496                 /*
1497                  * the first cic on the list is actually the head itself
1498                  */
1499                 if (cic->key == cfqd)
1500                         goto out;
1501
1502                 if (unlikely(!cic->key)) {
1503                         read_lock(&cfq_exit_lock);
1504                         if (list_empty(&cic->list))
1505                                 ioc->cic = NULL;
1506                         else
1507                                 ioc->cic = list_entry(cic->list.next,
1508                                                       struct cfq_io_context,
1509                                                       list);
1510                         read_unlock(&cfq_exit_lock);
1511                         kmem_cache_free(cfq_ioc_pool, cic);
1512                         atomic_dec(&ioc_count);
1513                         goto restart;
1514                 }
1515
1516                 /*
1517                  * cic exists, check if we already are there. linear search
1518                  * should be ok here, the list will usually not be more than
1519                  * 1 or a few entries long
1520                  */
1521                 list_for_each_entry(__cic, &cic->list, list) {
1522                         /*
1523                          * this process is already holding a reference to
1524                          * this queue, so no need to get one more
1525                          */
1526                         if (__cic->key == cfqd) {
1527                                 cic = __cic;
1528                                 goto out;
1529                         }
1530                         if (unlikely(!__cic->key)) {
1531                                 read_lock(&cfq_exit_lock);
1532                                 list_del(&__cic->list);
1533                                 read_unlock(&cfq_exit_lock);
1534                                 kmem_cache_free(cfq_ioc_pool, __cic);
1535                                 atomic_dec(&ioc_count);
1536                                 goto restart;
1537                         }
1538                 }
1539
1540                 /*
1541                  * nope, process doesn't have a cic associated with this
1542                  * cfqq yet. get a new one and add to list
1543                  */
1544                 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1545                 if (__cic == NULL)
1546                         goto err;
1547
1548                 __cic->ioc = ioc;
1549                 __cic->key = cfqd;
1550                 read_lock(&cfq_exit_lock);
1551                 list_add(&__cic->list, &cic->list);
1552                 list_add(&__cic->queue_list, &cfqd->cic_list);
1553                 read_unlock(&cfq_exit_lock);
1554                 cic = __cic;
1555         }
1556
1557 out:
1558         return cic;
1559 err:
1560         put_io_context(ioc);
1561         return NULL;
1562 }
1563
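/*
 * Update the exponentially decaying "thinktime" stats for this context:
 * the time between the last request completing and a new one being queued,
 * capped at twice the idle slice. Used to decide whether idling pays off.
 */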
1564 static void
1565 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1566 {
1567         unsigned long elapsed, ttime;
1568
1569         /*
1570          * if this context already has stuff queued, thinktime is from
1571          * last queue not last end
1572          */
1573 #if 0
1574         if (time_after(cic->last_end_request, cic->last_queue))
1575                 elapsed = jiffies - cic->last_end_request;
1576         else
1577                 elapsed = jiffies - cic->last_queue;
1578 #else
1579                 elapsed = jiffies - cic->last_end_request;
1580 #endif
1581
1582         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1583
1584         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1585         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1586         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1587 }
1588
1589 #define sample_valid(samples)   ((samples) > 80)
1590
1591 /*
1592  * Disable idle window if the process thinks too long or seeks so much that
1593  * it doesn't matter
1594  */
1595 static void
1596 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1597                        struct cfq_io_context *cic)
1598 {
1599         int enable_idle = cfq_cfqq_idle_window(cfqq);
1600
1601         if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1602                 enable_idle = 0;
1603         else if (sample_valid(cic->ttime_samples)) {
1604                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1605                         enable_idle = 0;
1606                 else
1607                         enable_idle = 1;
1608         }
1609
1610         if (enable_idle)
1611                 cfq_mark_cfqq_idle_window(cfqq);
1612         else
1613                 cfq_clear_cfqq_idle_window(cfqq);
1614 }
1615
1616
1617 /*
1618  * Check if new_cfqq should preempt the currently active queue. Return 0 for
1619  * no (or if we aren't sure); returning 1 will cause a preempt.
1620  */
1621 static int
1622 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1623                    struct cfq_rq *crq)
1624 {
1625         struct cfq_queue *cfqq = cfqd->active_queue;
1626
1627         if (cfq_class_idle(new_cfqq))
1628                 return 0;
1629
1630         if (!cfqq)
1631                 return 1;
1632
1633         if (cfq_class_idle(cfqq))
1634                 return 1;
1635         if (!cfq_cfqq_wait_request(new_cfqq))
1636                 return 0;
1637         /*
1638          * if it doesn't have slice left, forget it
1639          */
1640         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1641                 return 0;
1642         if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1643                 return 1;
1644
1645         return 0;
1646 }
1647
1648 /*
1649  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1650  * let it have half of its nominal slice.
1651  */
1652 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1653 {
1654         struct cfq_queue *__cfqq, *next;
1655
1656         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1657                 cfq_resort_rr_list(__cfqq, 1);
1658
1659         if (!cfqq->slice_left)
1660                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1661
1662         cfqq->slice_end = cfqq->slice_left + jiffies;
1663         __cfq_slice_expired(cfqd, cfqq, 1);
1664         __cfq_set_active_queue(cfqd, cfqq);
1665 }
1666
1667 /*
1668  * should really be an ll_rw_blk.c helper
1669  */
1670 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1671 {
1672         request_queue_t *q = cfqd->queue;
1673
1674         if (!blk_queue_plugged(q))
1675                 q->request_fn(q);
1676         else
1677                 __generic_unplug_device(q);
1678 }
1679
1680 /*
1681  * Called when a new fs request (crq) is added (to cfqq). Check if there's
1682  * something we should do about it
1683  */
1684 static void
1685 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1686                  struct cfq_rq *crq)
1687 {
1688         struct cfq_io_context *cic;
1689
1690         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1691
1692         /*
1693          * we never wait for an async request and we don't allow preemption
1694          * of an async request. so just return early
1695          */
1696         if (!cfq_crq_is_sync(crq))
1697                 return;
1698
1699         cic = crq->io_context;
1700
1701         cfq_update_io_thinktime(cfqd, cic);
1702         cfq_update_idle_window(cfqd, cfqq, cic);
1703
1704         cic->last_queue = jiffies;
1705
1706         if (cfqq == cfqd->active_queue) {
1707                 /*
1708                  * if we are waiting for a request for this queue, let it rip
1709                  * immediately and flag that we must not expire this queue
1710                  * just now
1711                  */
1712                 if (cfq_cfqq_wait_request(cfqq)) {
1713                         cfq_mark_cfqq_must_dispatch(cfqq);
1714                         del_timer(&cfqd->idle_slice_timer);
1715                         cfq_start_queueing(cfqd, cfqq);
1716                 }
1717         } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1718                 /*
1719                  * not the active queue - expire current slice if it is
1720                  * idle and has expired its mean thinktime, or this new queue
1721                  * has some old slice time left and is of higher priority
1722                  */
1723                 cfq_preempt_queue(cfqd, cfqq);
1724                 cfq_mark_cfqq_must_dispatch(cfqq);
1725                 cfq_start_queueing(cfqd, cfqq);
1726         }
1727 }
1728
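/*
 * elevator add_req hook: refresh the queue's prio data, add the request
 * to the sector-sorted rb tree and the fifo, hash it for merge lookups
 * if it is mergeable, and let cfq_crq_enqueued() decide whether to kick
 * off dispatching.
 */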
1729 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1730 {
1731         struct cfq_data *cfqd = q->elevator->elevator_data;
1732         struct cfq_rq *crq = RQ_DATA(rq);
1733         struct cfq_queue *cfqq = crq->cfq_queue;
1734
1735         cfq_init_prio_data(cfqq);
1736
1737         cfq_add_crq_rb(crq);
1738
1739         list_add_tail(&rq->queuelist, &cfqq->fifo);
1740
1741         if (rq_mergeable(rq))
1742                 cfq_add_crq_hash(cfqd, crq);
1743
1744         cfq_crq_enqueued(cfqd, cfqq, crq);
1745 }
1746
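/*
 * elevator completion hook: drop the in-driver and on-dispatch counts,
 * record the completion time, and once the queue has nothing left in
 * flight re-sort it and schedule a new dispatch round.
 */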
1747 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1748 {
1749         struct cfq_rq *crq = RQ_DATA(rq);
1750         struct cfq_queue *cfqq = crq->cfq_queue;
1751         struct cfq_data *cfqd = cfqq->cfqd;
1752         const int sync = cfq_crq_is_sync(crq);
1753         unsigned long now;
1754
1755         now = jiffies;
1756
1757         WARN_ON(!cfqd->rq_in_driver);
1758         WARN_ON(!cfqq->on_dispatch[sync]);
1759         cfqd->rq_in_driver--;
1760         cfqq->on_dispatch[sync]--;
1761
1762         if (!cfq_class_idle(cfqq))
1763                 cfqd->last_end_request = now;
1764
1765         if (!cfq_cfqq_dispatched(cfqq)) {
1766                 if (cfq_cfqq_on_rr(cfqq)) {
1767                         cfqq->service_last = now;
1768                         cfq_resort_rr_list(cfqq, 0);
1769                 }
1770                 cfq_schedule_dispatch(cfqd);
1771         }
1772
1773         if (cfq_crq_is_sync(crq))
1774                 crq->io_context->last_end_request = now;
1775 }
1776
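/*
 * former/latter helpers for the elevator core: walk the sector-sorted
 * rb tree to the request immediately before or after rq.
 */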
1777 static struct request *
1778 cfq_former_request(request_queue_t *q, struct request *rq)
1779 {
1780         struct cfq_rq *crq = RQ_DATA(rq);
1781         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1782
1783         if (rbprev)
1784                 return rb_entry_crq(rbprev)->request;
1785
1786         return NULL;
1787 }
1788
1789 static struct request *
1790 cfq_latter_request(request_queue_t *q, struct request *rq)
1791 {
1792         struct cfq_rq *crq = RQ_DATA(rq);
1793         struct rb_node *rbnext = rb_next(&crq->rb_node);
1794
1795         if (rbnext)
1796                 return rb_entry_crq(rbnext)->request;
1797
1798         return NULL;
1799 }
1800
1801 /*
1802  * we temporarily boost lower priority queues if they are holding fs exclusive
1803  * resources. they are boosted to normal prio (CLASS_BE/4)
1804  */
1805 static void cfq_prio_boost(struct cfq_queue *cfqq)
1806 {
1807         const int ioprio_class = cfqq->ioprio_class;
1808         const int ioprio = cfqq->ioprio;
1809
1810         if (has_fs_excl()) {
1811                 /*
1812                  * boost idle prio on transactions that would lock out other
1813                  * users of the filesystem
1814                  */
1815                 if (cfq_class_idle(cfqq))
1816                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1817                 if (cfqq->ioprio > IOPRIO_NORM)
1818                         cfqq->ioprio = IOPRIO_NORM;
1819         } else {
1820                 /*
1821                  * check if we need to unboost the queue
1822                  */
1823                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1824                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1825                 if (cfqq->ioprio != cfqq->org_ioprio)
1826                         cfqq->ioprio = cfqq->org_ioprio;
1827         }
1828
1829         /*
1830          * refile between round-robin lists if we moved the priority class
1831          */
1832         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1833             cfq_cfqq_on_rr(cfqq))
1834                 cfq_resort_rr_list(cfqq, 0);
1835 }
1836
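/*
 * cfqq lookup key: reads, and writes from a task doing synchronous io,
 * are keyed by pid; all other (async) requests share CFQ_KEY_ASYNC.
 */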
1837 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1838 {
1839         if (rw == READ || process_sync(task))
1840                 return task->pid;
1841
1842         return CFQ_KEY_ASYNC;
1843 }
1844
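/*
 * only the #if 1 branch below is built: a queue we are idling on (or one
 * flagged must_alloc) gets a single ELV_MQUEUE_MUST per slice, everything
 * else is ELV_MQUEUE_MAY. The #else variant is an alternative, stricter
 * policy that is compiled out.
 */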
1845 static inline int
1846 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1847                 struct task_struct *task, int rw)
1848 {
1849 #if 1
1850         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1851             !cfq_cfqq_must_alloc_slice(cfqq)) {
1852                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1853                 return ELV_MQUEUE_MUST;
1854         }
1855
1856         return ELV_MQUEUE_MAY;
1857 #else
1858         if (!cfqq || task->flags & PF_MEMALLOC)
1859                 return ELV_MQUEUE_MAY;
1860         if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1861                 if (cfq_cfqq_wait_request(cfqq))
1862                         return ELV_MQUEUE_MUST;
1863
1864                 /*
1865                  * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1866                  * can quickly flood the queue with writes from a single task
1867                  */
1868                 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1869                         cfq_mark_cfqq_must_alloc_slice(cfqq);
1870                         return ELV_MQUEUE_MUST;
1871                 }
1872
1873                 return ELV_MQUEUE_MAY;
1874         }
1875         if (cfq_class_idle(cfqq))
1876                 return ELV_MQUEUE_NO;
1877         if (cfqq->allocated[rw] >= cfqd->max_queued) {
1878                 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1879                 int ret = ELV_MQUEUE_NO;
1880
1881                 if (ioc && ioc->nr_batch_requests)
1882                         ret = ELV_MQUEUE_MAY;
1883
1884                 put_io_context(ioc);
1885                 return ret;
1886         }
1887
1888         return ELV_MQUEUE_MAY;
1889 #endif
1890 }
1891
1892 static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1893 {
1894         struct cfq_data *cfqd = q->elevator->elevator_data;
1895         struct task_struct *tsk = current;
1896         struct cfq_queue *cfqq;
1897
1898         /*
1899          * don't force setup of a queue from here, as a call to may_queue
1900          * does not necessarily imply that a request actually will be queued.
1901          * so just lookup a possibly existing queue, or return 'may queue'
1902          * if that fails
1903          */
1904         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1905         if (cfqq) {
1906                 cfq_init_prio_data(cfqq);
1907                 cfq_prio_boost(cfqq);
1908
1909                 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1910         }
1911
1912         return ELV_MQUEUE_MAY;
1913 }
1914
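/*
 * wake up anyone sleeping on the request lists once this queue has gone
 * back under its per-direction allocation limit, or if we are currently
 * flagged rq_starved.
 */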
1915 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1916 {
1917         struct cfq_data *cfqd = q->elevator->elevator_data;
1918         struct request_list *rl = &q->rq;
1919
1920         if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1921                 smp_mb();
1922                 if (waitqueue_active(&rl->wait[READ]))
1923                         wake_up(&rl->wait[READ]);
1924         }
1925
1926         if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1927                 smp_mb();
1928                 if (waitqueue_active(&rl->wait[WRITE]))
1929                         wake_up(&rl->wait[WRITE]);
1930         }
1931 }
1932
1933 /*
1934  * queue lock held here
1935  */
1936 static void cfq_put_request(request_queue_t *q, struct request *rq)
1937 {
1938         struct cfq_data *cfqd = q->elevator->elevator_data;
1939         struct cfq_rq *crq = RQ_DATA(rq);
1940
1941         if (crq) {
1942                 struct cfq_queue *cfqq = crq->cfq_queue;
1943                 const int rw = rq_data_dir(rq);
1944
1945                 BUG_ON(!cfqq->allocated[rw]);
1946                 cfqq->allocated[rw]--;
1947
1948                 put_io_context(crq->io_context->ioc);
1949
1950                 mempool_free(crq, cfqd->crq_pool);
1951                 rq->elevator_private = NULL;
1952
1953                 cfq_check_waiters(q, cfqq);
1954                 cfq_put_queue(cfqq);
1955         }
1956 }
1957
1958 /*
1959  * Allocate cfq data structures associated with this request.
1960  */
1961 static int
1962 cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1963                 gfp_t gfp_mask)
1964 {
1965         struct cfq_data *cfqd = q->elevator->elevator_data;
1966         struct task_struct *tsk = current;
1967         struct cfq_io_context *cic;
1968         const int rw = rq_data_dir(rq);
1969         pid_t key = cfq_queue_pid(tsk, rw);
1970         struct cfq_queue *cfqq;
1971         struct cfq_rq *crq;
1972         unsigned long flags;
1973         int is_sync = key != CFQ_KEY_ASYNC;
1974
1975         might_sleep_if(gfp_mask & __GFP_WAIT);
1976
1977         cic = cfq_get_io_context(cfqd, key, gfp_mask);
1978
1979         spin_lock_irqsave(q->queue_lock, flags);
1980
1981         if (!cic)
1982                 goto queue_fail;
1983
1984         if (!cic->cfqq[is_sync]) {
1985                 cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
1986                 if (!cfqq)
1987                         goto queue_fail;
1988
1989                 cic->cfqq[is_sync] = cfqq;
1990         } else
1991                 cfqq = cic->cfqq[is_sync];
1992
1993         cfqq->allocated[rw]++;
1994         cfq_clear_cfqq_must_alloc(cfqq);
1995         cfqd->rq_starved = 0;
1996         atomic_inc(&cfqq->ref);
1997         spin_unlock_irqrestore(q->queue_lock, flags);
1998
1999         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
2000         if (crq) {
2001                 RB_CLEAR(&crq->rb_node);
2002                 crq->rb_key = 0;
2003                 crq->request = rq;
2004                 INIT_HLIST_NODE(&crq->hash);
2005                 crq->cfq_queue = cfqq;
2006                 crq->io_context = cic;
2007
2008                 if (is_sync)
2009                         cfq_mark_crq_is_sync(crq);
2010                 else
2011                         cfq_clear_crq_is_sync(crq);
2012
2013                 rq->elevator_private = crq;
2014                 return 0;
2015         }
2016
2017         spin_lock_irqsave(q->queue_lock, flags);
2018         cfqq->allocated[rw]--;
2019         if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2020                 cfq_mark_cfqq_must_alloc(cfqq);
2021         cfq_put_queue(cfqq);
2022 queue_fail:
2023         if (cic)
2024                 put_io_context(cic->ioc);
2025         /*
2026          * mark us rq allocation starved. we need to kickstart the process
2027          * ourselves if there are no pending requests that can do it for us.
2028          * that would be an extremely rare OOM situation
2029          */
2030         cfqd->rq_starved = 1;
2031         cfq_schedule_dispatch(cfqd);
2032         spin_unlock_irqrestore(q->queue_lock, flags);
2033         return 1;
2034 }
2035
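/*
 * unplug_work handler: with the queue lock held, opportunistically wake
 * any tasks that went to sleep while we were rq starved, then unplug and
 * re-run the request_fn.
 */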
2036 static void cfq_kick_queue(void *data)
2037 {
2038         request_queue_t *q = data;
2039         struct cfq_data *cfqd = q->elevator->elevator_data;
2040         unsigned long flags;
2041
2042         spin_lock_irqsave(q->queue_lock, flags);
2043
2044         if (cfqd->rq_starved) {
2045                 struct request_list *rl = &q->rq;
2046
2047                 /*
2048                  * we aren't guaranteed to get a request after this, but we
2049                  * have to be opportunistic
2050                  */
2051                 smp_mb();
2052                 if (waitqueue_active(&rl->wait[READ]))
2053                         wake_up(&rl->wait[READ]);
2054                 if (waitqueue_active(&rl->wait[WRITE]))
2055                         wake_up(&rl->wait[WRITE]);
2056         }
2057
2058         blk_remove_plug(q);
2059         q->request_fn(q);
2060         spin_unlock_irqrestore(q->queue_lock, flags);
2061 }
2062
2063 /*
2064  * Timer running if the active_queue is currently idling inside its time slice
2065  */
2066 static void cfq_idle_slice_timer(unsigned long data)
2067 {
2068         struct cfq_data *cfqd = (struct cfq_data *) data;
2069         struct cfq_queue *cfqq;
2070         unsigned long flags;
2071
2072         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2073
2074         if ((cfqq = cfqd->active_queue) != NULL) {
2075                 unsigned long now = jiffies;
2076
2077                 /*
2078                  * expired
2079                  */
2080                 if (time_after(now, cfqq->slice_end))
2081                         goto expire;
2082
2083                 /*
2084                  * only expire and reinvoke request handler, if there are
2085                  * other queues with pending requests
2086                  */
2087                 if (!cfqd->busy_queues) {
2088                         cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2089                         add_timer(&cfqd->idle_slice_timer);
2090                         goto out_cont;
2091                 }
2092
2093                 /*
2094                  * not expired and it has a request pending, let it dispatch
2095                  */
2096                 if (!RB_EMPTY(&cfqq->sort_list)) {
2097                         cfq_mark_cfqq_must_dispatch(cfqq);
2098                         goto out_kick;
2099                 }
2100         }
2101 expire:
2102         cfq_slice_expired(cfqd, 0);
2103 out_kick:
2104         cfq_schedule_dispatch(cfqd);
2105 out_cont:
2106         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2107 }
2108
2109 /*
2110  * Timer running if an idle class queue is waiting for service
2111  */
2112 static void cfq_idle_class_timer(unsigned long data)
2113 {
2114         struct cfq_data *cfqd = (struct cfq_data *) data;
2115         unsigned long flags, end;
2116
2117         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2118
2119         /*
2120          * race with a non-idle queue, reset timer
2121          */
2122         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2123         if (!time_after_eq(jiffies, end)) {
2124                 cfqd->idle_class_timer.expires = end;
2125                 add_timer(&cfqd->idle_class_timer);
2126         } else
2127                 cfq_schedule_dispatch(cfqd);
2128
2129         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2130 }
2131
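/*
 * make sure neither timer can fire again and sync the block queue so no
 * unplug work is still in flight before tearing things down.
 */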
2132 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2133 {
2134         del_timer_sync(&cfqd->idle_slice_timer);
2135         del_timer_sync(&cfqd->idle_class_timer);
2136         blk_sync_queue(cfqd->queue);
2137 }
2138
2139 static void cfq_put_cfqd(struct cfq_data *cfqd)
2140 {
2141         if (!atomic_dec_and_test(&cfqd->ref))
2142                 return;
2143
2144         cfq_shutdown_timer_wq(cfqd);
2145
2146         mempool_destroy(cfqd->crq_pool);
2147         kfree(cfqd->crq_hash);
2148         kfree(cfqd->cfq_hash);
2149         kfree(cfqd);
2150 }
2151
2152 static void cfq_exit_queue(elevator_t *e)
2153 {
2154         struct cfq_data *cfqd = e->elevator_data;
2155         request_queue_t *q = cfqd->queue;
2156
2157         cfq_shutdown_timer_wq(cfqd);
2158         write_lock(&cfq_exit_lock);
2159         spin_lock_irq(q->queue_lock);
2160         if (cfqd->active_queue)
2161                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2162         while (!list_empty(&cfqd->cic_list)) {
2163                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2164                                                         struct cfq_io_context,
2165                                                         queue_list);
2166                 if (cic->cfqq[ASYNC]) {
2167                         cfq_put_queue(cic->cfqq[ASYNC]);
2168                         cic->cfqq[ASYNC] = NULL;
2169                 }
2170                 if (cic->cfqq[SYNC]) {
2171                         cfq_put_queue(cic->cfqq[SYNC]);
2172                         cic->cfqq[SYNC] = NULL;
2173                 }
2174                 cic->key = NULL;
2175                 list_del_init(&cic->queue_list);
2176         }
2177         spin_unlock_irq(q->queue_lock);
2178         write_unlock(&cfq_exit_lock);
2179         cfq_put_cfqd(cfqd);
2180 }
2181
2182 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2183 {
2184         struct cfq_data *cfqd;
2185         int i;
2186
2187         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2188         if (!cfqd)
2189                 return -ENOMEM;
2190
2191         memset(cfqd, 0, sizeof(*cfqd));
2192
2193         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2194                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2195
2196         INIT_LIST_HEAD(&cfqd->busy_rr);
2197         INIT_LIST_HEAD(&cfqd->cur_rr);
2198         INIT_LIST_HEAD(&cfqd->idle_rr);
2199         INIT_LIST_HEAD(&cfqd->empty_list);
2200         INIT_LIST_HEAD(&cfqd->cic_list);
2201
2202         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2203         if (!cfqd->crq_hash)
2204                 goto out_crqhash;
2205
2206         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2207         if (!cfqd->cfq_hash)
2208                 goto out_cfqhash;
2209
2210         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
2211         if (!cfqd->crq_pool)
2212                 goto out_crqpool;
2213
2214         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2215                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2216         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2217                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2218
2219         e->elevator_data = cfqd;
2220
2221         cfqd->queue = q;
2222
2223         cfqd->max_queued = q->nr_requests / 4;
2224         q->nr_batching = cfq_queued;
2225
2226         init_timer(&cfqd->idle_slice_timer);
2227         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2228         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2229
2230         init_timer(&cfqd->idle_class_timer);
2231         cfqd->idle_class_timer.function = cfq_idle_class_timer;
2232         cfqd->idle_class_timer.data = (unsigned long) cfqd;
2233
2234         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2235
2236         atomic_set(&cfqd->ref, 1);
2237
2238         cfqd->cfq_queued = cfq_queued;
2239         cfqd->cfq_quantum = cfq_quantum;
2240         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2241         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2242         cfqd->cfq_back_max = cfq_back_max;
2243         cfqd->cfq_back_penalty = cfq_back_penalty;
2244         cfqd->cfq_slice[0] = cfq_slice_async;
2245         cfqd->cfq_slice[1] = cfq_slice_sync;
2246         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2247         cfqd->cfq_slice_idle = cfq_slice_idle;
2248         cfqd->cfq_max_depth = cfq_max_depth;
2249
2250         return 0;
2251 out_crqpool:
2252         kfree(cfqd->cfq_hash);
2253 out_cfqhash:
2254         kfree(cfqd->crq_hash);
2255 out_crqhash:
2256         kfree(cfqd);
2257         return -ENOMEM;
2258 }
2259
2260 static void cfq_slab_kill(void)
2261 {
2262         if (crq_pool)
2263                 kmem_cache_destroy(crq_pool);
2264         if (cfq_pool)
2265                 kmem_cache_destroy(cfq_pool);
2266         if (cfq_ioc_pool)
2267                 kmem_cache_destroy(cfq_ioc_pool);
2268 }
2269
2270 static int __init cfq_slab_setup(void)
2271 {
2272         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2273                                         NULL, NULL);
2274         if (!crq_pool)
2275                 goto fail;
2276
2277         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2278                                         NULL, NULL);
2279         if (!cfq_pool)
2280                 goto fail;
2281
2282         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2283                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2284         if (!cfq_ioc_pool)
2285                 goto fail;
2286
2287         return 0;
2288 fail:
2289         cfq_slab_kill();
2290         return -ENOMEM;
2291 }
2292
2293 /*
2294  * sysfs parts below -->
2295  */
2296 struct cfq_fs_entry {
2297         struct attribute attr;
2298         ssize_t (*show)(struct cfq_data *, char *);
2299         ssize_t (*store)(struct cfq_data *, const char *, size_t);
2300 };
2301
2302 static ssize_t
2303 cfq_var_show(unsigned int var, char *page)
2304 {
2305         return sprintf(page, "%u\n", var);
2306 }
2307
2308 static ssize_t
2309 cfq_var_store(unsigned int *var, const char *page, size_t count)
2310 {
2311         char *p = (char *) page;
2312
2313         *var = simple_strtoul(p, &p, 10);
2314         return count;
2315 }
2316
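/*
 * show/store boilerplate for the sysfs tunables below; __CONV selects a
 * jiffies <-> milliseconds conversion for the time based ones.
 */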
2317 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2318 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
2319 {                                                                       \
2320         unsigned int __data = __VAR;                                    \
2321         if (__CONV)                                                     \
2322                 __data = jiffies_to_msecs(__data);                      \
2323         return cfq_var_show(__data, (page));                            \
2324 }
2325 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2326 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2327 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2328 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2329 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
2330 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
2331 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2332 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2333 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2334 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2335 SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2336 #undef SHOW_FUNCTION
2337
2338 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2339 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
2340 {                                                                       \
2341         unsigned int __data;                                            \
2342         int ret = cfq_var_store(&__data, (page), count);                \
2343         if (__data < (MIN))                                             \
2344                 __data = (MIN);                                         \
2345         else if (__data > (MAX))                                        \
2346                 __data = (MAX);                                         \
2347         if (__CONV)                                                     \
2348                 *(__PTR) = msecs_to_jiffies(__data);                    \
2349         else                                                            \
2350                 *(__PTR) = __data;                                      \
2351         return ret;                                                     \
2352 }
2353 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2354 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2355 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2356 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2357 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2358 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2359 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2360 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2361 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2362 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2363 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2364 #undef STORE_FUNCTION
2365
2366 static struct cfq_fs_entry cfq_quantum_entry = {
2367         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
2368         .show = cfq_quantum_show,
2369         .store = cfq_quantum_store,
2370 };
2371 static struct cfq_fs_entry cfq_queued_entry = {
2372         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
2373         .show = cfq_queued_show,
2374         .store = cfq_queued_store,
2375 };
2376 static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
2377         .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
2378         .show = cfq_fifo_expire_sync_show,
2379         .store = cfq_fifo_expire_sync_store,
2380 };
2381 static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
2382         .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
2383         .show = cfq_fifo_expire_async_show,
2384         .store = cfq_fifo_expire_async_store,
2385 };
2386 static struct cfq_fs_entry cfq_back_max_entry = {
2387         .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
2388         .show = cfq_back_max_show,
2389         .store = cfq_back_max_store,
2390 };
2391 static struct cfq_fs_entry cfq_back_penalty_entry = {
2392         .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
2393         .show = cfq_back_penalty_show,
2394         .store = cfq_back_penalty_store,
2395 };
2396 static struct cfq_fs_entry cfq_slice_sync_entry = {
2397         .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
2398         .show = cfq_slice_sync_show,
2399         .store = cfq_slice_sync_store,
2400 };
2401 static struct cfq_fs_entry cfq_slice_async_entry = {
2402         .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
2403         .show = cfq_slice_async_show,
2404         .store = cfq_slice_async_store,
2405 };
2406 static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2407         .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2408         .show = cfq_slice_async_rq_show,
2409         .store = cfq_slice_async_rq_store,
2410 };
2411 static struct cfq_fs_entry cfq_slice_idle_entry = {
2412         .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2413         .show = cfq_slice_idle_show,
2414         .store = cfq_slice_idle_store,
2415 };
2416 static struct cfq_fs_entry cfq_max_depth_entry = {
2417         .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2418         .show = cfq_max_depth_show,
2419         .store = cfq_max_depth_store,
2420 };
2421
2422 static struct attribute *default_attrs[] = {
2423         &cfq_quantum_entry.attr,
2424         &cfq_queued_entry.attr,
2425         &cfq_fifo_expire_sync_entry.attr,
2426         &cfq_fifo_expire_async_entry.attr,
2427         &cfq_back_max_entry.attr,
2428         &cfq_back_penalty_entry.attr,
2429         &cfq_slice_sync_entry.attr,
2430         &cfq_slice_async_entry.attr,
2431         &cfq_slice_async_rq_entry.attr,
2432         &cfq_slice_idle_entry.attr,
2433         &cfq_max_depth_entry.attr,
2434         NULL,
2435 };
2436
2437 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
2438
2439 static ssize_t
2440 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2441 {
2442         elevator_t *e = container_of(kobj, elevator_t, kobj);
2443         struct cfq_fs_entry *entry = to_cfq(attr);
2444
2445         if (!entry->show)
2446                 return -EIO;
2447
2448         return entry->show(e->elevator_data, page);
2449 }
2450
2451 static ssize_t
2452 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
2453                const char *page, size_t length)
2454 {
2455         elevator_t *e = container_of(kobj, elevator_t, kobj);
2456         struct cfq_fs_entry *entry = to_cfq(attr);
2457
2458         if (!entry->store)
2459                 return -EIO;
2460
2461         return entry->store(e->elevator_data, page, length);
2462 }
2463
2464 static struct sysfs_ops cfq_sysfs_ops = {
2465         .show   = cfq_attr_show,
2466         .store  = cfq_attr_store,
2467 };
2468
2469 static struct kobj_type cfq_ktype = {
2470         .sysfs_ops      = &cfq_sysfs_ops,
2471         .default_attrs  = default_attrs,
2472 };
2473
2474 static struct elevator_type iosched_cfq = {
2475         .ops = {
2476                 .elevator_merge_fn =            cfq_merge,
2477                 .elevator_merged_fn =           cfq_merged_request,
2478                 .elevator_merge_req_fn =        cfq_merged_requests,
2479                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2480                 .elevator_add_req_fn =          cfq_insert_request,
2481                 .elevator_activate_req_fn =     cfq_activate_request,
2482                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2483                 .elevator_queue_empty_fn =      cfq_queue_empty,
2484                 .elevator_completed_req_fn =    cfq_completed_request,
2485                 .elevator_former_req_fn =       cfq_former_request,
2486                 .elevator_latter_req_fn =       cfq_latter_request,
2487                 .elevator_set_req_fn =          cfq_set_request,
2488                 .elevator_put_req_fn =          cfq_put_request,
2489                 .elevator_may_queue_fn =        cfq_may_queue,
2490                 .elevator_init_fn =             cfq_init_queue,
2491                 .elevator_exit_fn =             cfq_exit_queue,
2492                 .trim =                         cfq_trim,
2493         },
2494         .elevator_ktype =       &cfq_ktype,
2495         .elevator_name =        "cfq",
2496         .elevator_owner =       THIS_MODULE,
2497 };
2498
2499 static int __init cfq_init(void)
2500 {
2501         int ret;
2502
2503         /*
2504          * could be 0 on HZ < 1000 setups
2505          */
2506         if (!cfq_slice_async)
2507                 cfq_slice_async = 1;
2508         if (!cfq_slice_idle)
2509                 cfq_slice_idle = 1;
2510
2511         if (cfq_slab_setup())
2512                 return -ENOMEM;
2513
2514         ret = elv_register(&iosched_cfq);
2515         if (ret)
2516                 cfq_slab_kill();
2517
2518         return ret;
2519 }
2520
2521 static void __exit cfq_exit(void)
2522 {
2523         DECLARE_COMPLETION(all_gone);
2524         elv_unregister(&iosched_cfq);
2525         ioc_gone = &all_gone;
2526         barrier();
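        /*
         * the io context teardown path is expected to complete all_gone
         * once ioc_count drops to zero with ioc_gone set; waiting for
         * that here keeps cfq_slab_kill() from destroying the
         * cfq_io_context slab while contexts are still being freed.
         * barrier() orders the ioc_gone store before the re-check.
         */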
2527         if (atomic_read(&ioc_count))
2528                 wait_for_completion(ioc_gone);
2529         synchronize_rcu();
2530         cfq_slab_kill();
2531 }
2532
2533 module_init(cfq_init);
2534 module_exit(cfq_exit);
2535
2536 MODULE_AUTHOR("Jens Axboe");
2537 MODULE_LICENSE("GPL");
2538 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");