block/cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished I/O scheduler (round-robin
5  *  per-process disk scheduling) and on ideas from Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8  */
9 #include <linux/kernel.h>
10 #include <linux/fs.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/bio.h>
14 #include <linux/config.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/init.h>
18 #include <linux/compiler.h>
19 #include <linux/hash.h>
20 #include <linux/rbtree.h>
21 #include <linux/mempool.h>
22 #include <linux/ioprio.h>
23 #include <linux/writeback.h>
24
25 /*
26  * tunables
27  */
28 static const int cfq_quantum = 4;               /* max queue in one round of service */
29 static const int cfq_queued = 8;                /* minimum rq allocate limit per-queue */
30 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
31 static const int cfq_back_max = 16 * 1024;      /* maximum backwards seek, in KiB */
32 static const int cfq_back_penalty = 2;          /* penalty of a backwards seek */
33
34 static const int cfq_slice_sync = HZ / 10;
35 static int cfq_slice_async = HZ / 25;
36 static const int cfq_slice_async_rq = 2;
37 static int cfq_slice_idle = HZ / 100;
38
39 #define CFQ_IDLE_GRACE          (HZ / 10)
40 #define CFQ_SLICE_SCALE         (5)
41
42 #define CFQ_KEY_ASYNC           (0)
43 #define CFQ_KEY_ANY             (0xffff)
44
45 /*
46  * disable queueing at the driver/hardware level
47  */
48 static const int cfq_max_depth = 2;
49
50 static DEFINE_RWLOCK(cfq_exit_lock);
51
52 /*
53  * for the hash of cfqq inside the cfqd
54  */
55 #define CFQ_QHASH_SHIFT         6
56 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
57 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
58
59 /*
60  * for the hash of crq inside the cfqq
61  */
62 #define CFQ_MHASH_SHIFT         6
63 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
64 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
65 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
66 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
67 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
68
69 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
70 #define list_entry_fifo(ptr)    list_entry((ptr), struct request, queuelist)
71
72 #define RQ_DATA(rq)             (rq)->elevator_private
73
74 /*
75  * rb-tree defines
76  */
77 #define RB_NONE                 (2)
78 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
79 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
80 #define RB_CLEAR(node)          do {    \
81         (node)->rb_parent = NULL;       \
82         RB_CLEAR_COLOR((node));         \
83         (node)->rb_right = NULL;        \
84         (node)->rb_left = NULL;         \
85 } while (0)
86 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
87 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
88 #define rq_rb_key(rq)           (rq)->sector
89
90 static kmem_cache_t *crq_pool;
91 static kmem_cache_t *cfq_pool;
92 static kmem_cache_t *cfq_ioc_pool;
93
94 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
95 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
96 #define cfq_class_be(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
97 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
98
99 #define ASYNC                   (0)
100 #define SYNC                    (1)
101
102 #define cfq_cfqq_dispatched(cfqq)       \
103         ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
104
105 #define cfq_cfqq_class_sync(cfqq)       ((cfqq)->key != CFQ_KEY_ASYNC)
106
107 #define cfq_cfqq_sync(cfqq)             \
108         (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
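
/*
 * A queue keyed with CFQ_KEY_ASYNC is treated as async class; any other key
 * marks a sync-class queue. Sync queues are looked up by process id (see
 * cfq_find_rq_rb() below, which uses current->pid as the hash key).
 */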
109
110 /*
111  * Per block device queue structure
112  */
113 struct cfq_data {
114         atomic_t ref;
115         request_queue_t *queue;
116
117         /*
118          * rr list of queues with requests and the count of them
119          */
120         struct list_head rr_list[CFQ_PRIO_LISTS];
121         struct list_head busy_rr;
122         struct list_head cur_rr;
123         struct list_head idle_rr;
124         unsigned int busy_queues;
125
126         /*
127          * non-ordered list of empty cfqq's
128          */
129         struct list_head empty_list;
130
131         /*
132          * cfqq lookup hash
133          */
134         struct hlist_head *cfq_hash;
135
136         /*
137          * global crq hash for all queues
138          */
139         struct hlist_head *crq_hash;
140
141         unsigned int max_queued;
142
143         mempool_t *crq_pool;
144
145         int rq_in_driver;
146
147         /*
148          * schedule slice state info
149          */
150         /*
151          * idle window management
152          */
153         struct timer_list idle_slice_timer;
154         struct work_struct unplug_work;
155
156         struct cfq_queue *active_queue;
157         struct cfq_io_context *active_cic;
158         int cur_prio, cur_end_prio;
159         unsigned int dispatch_slice;
160
161         struct timer_list idle_class_timer;
162
163         sector_t last_sector;
164         unsigned long last_end_request;
165
166         unsigned int rq_starved;
167
168         /*
169          * tunables, see top of file
170          */
171         unsigned int cfq_quantum;
172         unsigned int cfq_queued;
173         unsigned int cfq_fifo_expire[2];
174         unsigned int cfq_back_penalty;
175         unsigned int cfq_back_max;
176         unsigned int cfq_slice[2];
177         unsigned int cfq_slice_async_rq;
178         unsigned int cfq_slice_idle;
179         unsigned int cfq_max_depth;
180
181         struct list_head cic_list;
182 };
183
184 /*
185  * Per process-grouping structure
186  */
187 struct cfq_queue {
188         /* reference count */
189         atomic_t ref;
190         /* parent cfq_data */
191         struct cfq_data *cfqd;
192         /* cfqq lookup hash */
193         struct hlist_node cfq_hash;
194         /* hash key */
195         unsigned int key;
196         /* on either rr or empty list of cfqd */
197         struct list_head cfq_list;
198         /* sorted list of pending requests */
199         struct rb_root sort_list;
200         /* if fifo isn't expired, next request to serve */
201         struct cfq_rq *next_crq;
202         /* requests queued in sort_list */
203         int queued[2];
204         /* currently allocated requests */
205         int allocated[2];
206         /* fifo list of requests in sort_list */
207         struct list_head fifo;
208
209         unsigned long slice_start;
210         unsigned long slice_end;
211         unsigned long slice_left;
212         unsigned long service_last;
213
214         /* number of requests that are on the dispatch list */
215         int on_dispatch[2];
216
217         /* io prio of this group */
218         unsigned short ioprio, org_ioprio;
219         unsigned short ioprio_class, org_ioprio_class;
220
221         /* various state flags, see below */
222         unsigned int flags;
223 };
224
225 struct cfq_rq {
226         struct rb_node rb_node;
227         sector_t rb_key;
228         struct request *request;
229         struct hlist_node hash;
230
231         struct cfq_queue *cfq_queue;
232         struct cfq_io_context *io_context;
233
234         unsigned int crq_flags;
235 };
236
237 enum cfqq_state_flags {
238         CFQ_CFQQ_FLAG_on_rr = 0,
239         CFQ_CFQQ_FLAG_wait_request,
240         CFQ_CFQQ_FLAG_must_alloc,
241         CFQ_CFQQ_FLAG_must_alloc_slice,
242         CFQ_CFQQ_FLAG_must_dispatch,
243         CFQ_CFQQ_FLAG_fifo_expire,
244         CFQ_CFQQ_FLAG_idle_window,
245         CFQ_CFQQ_FLAG_prio_changed,
246 };
247
248 #define CFQ_CFQQ_FNS(name)                                              \
249 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
250 {                                                                       \
251         cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);                     \
252 }                                                                       \
253 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
254 {                                                                       \
255         cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                    \
256 }                                                                       \
257 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
258 {                                                                       \
259         return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;        \
260 }
261
262 CFQ_CFQQ_FNS(on_rr);
263 CFQ_CFQQ_FNS(wait_request);
264 CFQ_CFQQ_FNS(must_alloc);
265 CFQ_CFQQ_FNS(must_alloc_slice);
266 CFQ_CFQQ_FNS(must_dispatch);
267 CFQ_CFQQ_FNS(fifo_expire);
268 CFQ_CFQQ_FNS(idle_window);
269 CFQ_CFQQ_FNS(prio_changed);
270 #undef CFQ_CFQQ_FNS
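
/*
 * For each flag above, CFQ_CFQQ_FNS() generates three helpers, e.g. for
 * on_rr: cfq_mark_cfqq_on_rr() sets the bit, cfq_clear_cfqq_on_rr() clears
 * it, and cfq_cfqq_on_rr() tests it.
 */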
271
272 enum cfq_rq_state_flags {
273         CFQ_CRQ_FLAG_is_sync = 0,
274 };
275
276 #define CFQ_CRQ_FNS(name)                                               \
277 static inline void cfq_mark_crq_##name(struct cfq_rq *crq)              \
278 {                                                                       \
279         crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);                   \
280 }                                                                       \
281 static inline void cfq_clear_crq_##name(struct cfq_rq *crq)             \
282 {                                                                       \
283         crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);                  \
284 }                                                                       \
285 static inline int cfq_crq_##name(const struct cfq_rq *crq)              \
286 {                                                                       \
287         return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;      \
288 }
289
290 CFQ_CRQ_FNS(is_sync);
291 #undef CFQ_CRQ_FNS
292
293 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
294 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
295 static void cfq_put_cfqd(struct cfq_data *cfqd);
296
297 #define process_sync(tsk)       ((tsk)->flags & PF_SYNCWRITE)
298
299 /*
300  * lots of deadline iosched dupes, can be abstracted later...
301  */
302 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
303 {
304         hlist_del_init(&crq->hash);
305 }
306
307 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
308 {
309         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
310
311         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
312 }
313
314 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
315 {
316         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
317         struct hlist_node *entry, *next;
318
319         hlist_for_each_safe(entry, next, hash_list) {
320                 struct cfq_rq *crq = list_entry_hash(entry);
321                 struct request *__rq = crq->request;
322
323                 if (!rq_mergeable(__rq)) {
324                         cfq_del_crq_hash(crq);
325                         continue;
326                 }
327
328                 if (rq_hash_key(__rq) == offset)
329                         return __rq;
330         }
331
332         return NULL;
333 }
334
335 /*
336  * scheduler run of queue, if there are requests pending and no one in the
337  * driver that will restart queueing
338  */
339 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
340 {
341         if (cfqd->busy_queues)
342                 kblockd_schedule_work(&cfqd->unplug_work);
343 }
344
345 static int cfq_queue_empty(request_queue_t *q)
346 {
347         struct cfq_data *cfqd = q->elevator->elevator_data;
348
349         return !cfqd->busy_queues;
350 }
351
352 /*
353  * Lifted from AS - choose which of crq1 and crq2 is best served now.
354  * We choose the request that is closest to the head right now. Distance
355  * behind the head is penalized and only allowed to a certain extent.
356  */
357 static struct cfq_rq *
358 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
359 {
360         sector_t last, s1, s2, d1 = 0, d2 = 0;
361         int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
362         unsigned long back_max;
363
364         if (crq1 == NULL || crq1 == crq2)
365                 return crq2;
366         if (crq2 == NULL)
367                 return crq1;
368
369         if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
370                 return crq1;
371         else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
372                 return crq2;
373
374         s1 = crq1->request->sector;
375         s2 = crq2->request->sector;
376
377         last = cfqd->last_sector;
378
379         /*
380          * by definition, 1KiB is 2 sectors
381          */
382         back_max = cfqd->cfq_back_max * 2;
383
384         /*
385          * Strict one way elevator _except_ in the case where we allow
386          * short backward seeks which are biased as twice the cost of a
387          * similar forward seek.
388          */
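        /*
         * Example with the default tunables: cfq_back_max = 16384 KiB gives
         * back_max = 32768 sectors. A request 1000 sectors behind the head
         * gets d = 1000 * cfq_back_penalty = 2000, while a request more than
         * 32768 sectors behind the head is treated as wrapped (r*_wrap).
         */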
389         if (s1 >= last)
390                 d1 = s1 - last;
391         else if (s1 + back_max >= last)
392                 d1 = (last - s1) * cfqd->cfq_back_penalty;
393         else
394                 r1_wrap = 1;
395
396         if (s2 >= last)
397                 d2 = s2 - last;
398         else if (s2 + back_max >= last)
399                 d2 = (last - s2) * cfqd->cfq_back_penalty;
400         else
401                 r2_wrap = 1;
402
403         /* Found required data */
404         if (!r1_wrap && r2_wrap)
405                 return crq1;
406         else if (!r2_wrap && r1_wrap)
407                 return crq2;
408         else if (r1_wrap && r2_wrap) {
409                 /* both behind the head */
410                 if (s1 <= s2)
411                         return crq1;
412                 else
413                         return crq2;
414         }
415
416         /* Both requests in front of the head */
417         if (d1 < d2)
418                 return crq1;
419         else if (d2 < d1)
420                 return crq2;
421         else {
422                 if (s1 >= s2)
423                         return crq1;
424                 else
425                         return crq2;
426         }
427 }
428
429 /*
430  * would be nice to take fifo expire time into account as well
431  */
432 static struct cfq_rq *
433 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
434                   struct cfq_rq *last)
435 {
436         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
437         struct rb_node *rbnext, *rbprev;
438
439         if (!(rbnext = rb_next(&last->rb_node))) {
440                 rbnext = rb_first(&cfqq->sort_list);
441                 if (rbnext == &last->rb_node)
442                         rbnext = NULL;
443         }
444
445         rbprev = rb_prev(&last->rb_node);
446
447         if (rbprev)
448                 crq_prev = rb_entry_crq(rbprev);
449         if (rbnext)
450                 crq_next = rb_entry_crq(rbnext);
451
452         return cfq_choose_req(cfqd, crq_next, crq_prev);
453 }
454
455 static void cfq_update_next_crq(struct cfq_rq *crq)
456 {
457         struct cfq_queue *cfqq = crq->cfq_queue;
458
459         if (cfqq->next_crq == crq)
460                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
461 }
462
463 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
464 {
465         struct cfq_data *cfqd = cfqq->cfqd;
466         struct list_head *list, *entry;
467
468         BUG_ON(!cfq_cfqq_on_rr(cfqq));
469
470         list_del(&cfqq->cfq_list);
471
472         if (cfq_class_rt(cfqq))
473                 list = &cfqd->cur_rr;
474         else if (cfq_class_idle(cfqq))
475                 list = &cfqd->idle_rr;
476         else {
477                 /*
478                  * if cfqq has requests in flight, don't allow it to be
479                  * found in cfq_set_active_queue before it has finished them.
480                  * this is done to increase fairness between a process that
481                  * has lots of io pending vs one that only generates one
482                  * sporadically or synchronously
483                  */
484                 if (cfq_cfqq_dispatched(cfqq))
485                         list = &cfqd->busy_rr;
486                 else
487                         list = &cfqd->rr_list[cfqq->ioprio];
488         }
489
490         /*
491          * if queue was preempted, just add to front to be fair. busy_rr
492          * isn't sorted.
493          */
494         if (preempted || list == &cfqd->busy_rr) {
495                 list_add(&cfqq->cfq_list, list);
496                 return;
497         }
498
499         /*
500          * sort by when queue was last serviced
501          */
502         entry = list;
503         while ((entry = entry->prev) != list) {
504                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
505
506                 if (!__cfqq->service_last)
507                         break;
508                 if (time_before(__cfqq->service_last, cfqq->service_last))
509                         break;
510         }
511
512         list_add(&cfqq->cfq_list, entry);
513 }
514
515 /*
516  * add to busy list of queues for service, trying to be fair in ordering
517  * the pending list according to last request service
518  */
519 static inline void
520 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
521 {
522         BUG_ON(cfq_cfqq_on_rr(cfqq));
523         cfq_mark_cfqq_on_rr(cfqq);
524         cfqd->busy_queues++;
525
526         cfq_resort_rr_list(cfqq, 0);
527 }
528
529 static inline void
530 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
531 {
532         BUG_ON(!cfq_cfqq_on_rr(cfqq));
533         cfq_clear_cfqq_on_rr(cfqq);
534         list_move(&cfqq->cfq_list, &cfqd->empty_list);
535
536         BUG_ON(!cfqd->busy_queues);
537         cfqd->busy_queues--;
538 }
539
540 /*
541  * rb tree support functions
542  */
543 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
544 {
545         struct cfq_queue *cfqq = crq->cfq_queue;
546         struct cfq_data *cfqd = cfqq->cfqd;
547         const int sync = cfq_crq_is_sync(crq);
548
549         BUG_ON(!cfqq->queued[sync]);
550         cfqq->queued[sync]--;
551
552         cfq_update_next_crq(crq);
553
554         rb_erase(&crq->rb_node, &cfqq->sort_list);
555         RB_CLEAR_COLOR(&crq->rb_node);
556
557         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
558                 cfq_del_cfqq_rr(cfqd, cfqq);
559 }
560
561 static struct cfq_rq *
562 __cfq_add_crq_rb(struct cfq_rq *crq)
563 {
564         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
565         struct rb_node *parent = NULL;
566         struct cfq_rq *__crq;
567
568         while (*p) {
569                 parent = *p;
570                 __crq = rb_entry_crq(parent);
571
572                 if (crq->rb_key < __crq->rb_key)
573                         p = &(*p)->rb_left;
574                 else if (crq->rb_key > __crq->rb_key)
575                         p = &(*p)->rb_right;
576                 else
577                         return __crq;
578         }
579
580         rb_link_node(&crq->rb_node, parent, p);
581         return NULL;
582 }
583
584 static void cfq_add_crq_rb(struct cfq_rq *crq)
585 {
586         struct cfq_queue *cfqq = crq->cfq_queue;
587         struct cfq_data *cfqd = cfqq->cfqd;
588         struct request *rq = crq->request;
589         struct cfq_rq *__alias;
590
591         crq->rb_key = rq_rb_key(rq);
592         cfqq->queued[cfq_crq_is_sync(crq)]++;
593
594         /*
595          * looks a little odd, but the first insert might return an alias.
596          * if that happens, put the alias on the dispatch list
597          */
598         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
599                 cfq_dispatch_insert(cfqd->queue, __alias);
600
601         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
602
603         if (!cfq_cfqq_on_rr(cfqq))
604                 cfq_add_cfqq_rr(cfqd, cfqq);
605
606         /*
607          * check if this request is a better next-serve candidate
608          */
609         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
610 }
611
612 static inline void
613 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
614 {
615         rb_erase(&crq->rb_node, &cfqq->sort_list);
616         cfqq->queued[cfq_crq_is_sync(crq)]--;
617
618         cfq_add_crq_rb(crq);
619 }
620
621 static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
622
623 {
624         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
625         struct rb_node *n;
626
627         if (!cfqq)
628                 goto out;
629
630         n = cfqq->sort_list.rb_node;
631         while (n) {
632                 struct cfq_rq *crq = rb_entry_crq(n);
633
634                 if (sector < crq->rb_key)
635                         n = n->rb_left;
636                 else if (sector > crq->rb_key)
637                         n = n->rb_right;
638                 else
639                         return crq->request;
640         }
641
642 out:
643         return NULL;
644 }
645
646 static void cfq_activate_request(request_queue_t *q, struct request *rq)
647 {
648         struct cfq_data *cfqd = q->elevator->elevator_data;
649
650         cfqd->rq_in_driver++;
651 }
652
653 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
654 {
655         struct cfq_data *cfqd = q->elevator->elevator_data;
656
657         WARN_ON(!cfqd->rq_in_driver);
658         cfqd->rq_in_driver--;
659 }
660
661 static void cfq_remove_request(struct request *rq)
662 {
663         struct cfq_rq *crq = RQ_DATA(rq);
664
665         list_del_init(&rq->queuelist);
666         cfq_del_crq_rb(crq);
667         cfq_del_crq_hash(crq);
668 }
669
670 static int
671 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
672 {
673         struct cfq_data *cfqd = q->elevator->elevator_data;
674         struct request *__rq;
675         int ret;
676
677         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
678         if (__rq && elv_rq_merge_ok(__rq, bio)) {
679                 ret = ELEVATOR_BACK_MERGE;
680                 goto out;
681         }
682
683         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
684         if (__rq && elv_rq_merge_ok(__rq, bio)) {
685                 ret = ELEVATOR_FRONT_MERGE;
686                 goto out;
687         }
688
689         return ELEVATOR_NO_MERGE;
690 out:
691         *req = __rq;
692         return ret;
693 }
694
695 static void cfq_merged_request(request_queue_t *q, struct request *req)
696 {
697         struct cfq_data *cfqd = q->elevator->elevator_data;
698         struct cfq_rq *crq = RQ_DATA(req);
699
700         cfq_del_crq_hash(crq);
701         cfq_add_crq_hash(cfqd, crq);
702
703         if (rq_rb_key(req) != crq->rb_key) {
704                 struct cfq_queue *cfqq = crq->cfq_queue;
705
706                 cfq_update_next_crq(crq);
707                 cfq_reposition_crq_rb(cfqq, crq);
708         }
709 }
710
711 static void
712 cfq_merged_requests(request_queue_t *q, struct request *rq,
713                     struct request *next)
714 {
715         cfq_merged_request(q, rq);
716
717         /*
718          * reposition in fifo if next is older than rq
719          */
720         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
721             time_before(next->start_time, rq->start_time))
722                 list_move(&rq->queuelist, &next->queuelist);
723
724         cfq_remove_request(next);
725 }
726
727 static inline void
728 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
729 {
730         if (cfqq) {
731                 /*
732                  * stop potential idle class queues waiting service
733                  */
734                 del_timer(&cfqd->idle_class_timer);
735
736                 cfqq->slice_start = jiffies;
737                 cfqq->slice_end = 0;
738                 cfqq->slice_left = 0;
739                 cfq_clear_cfqq_must_alloc_slice(cfqq);
740                 cfq_clear_cfqq_fifo_expire(cfqq);
741         }
742
743         cfqd->active_queue = cfqq;
744 }
745
746 /*
747  * current cfqq expired its slice (or was too idle), select new one
748  */
749 static void
750 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
751                     int preempted)
752 {
753         unsigned long now = jiffies;
754
755         if (cfq_cfqq_wait_request(cfqq))
756                 del_timer(&cfqd->idle_slice_timer);
757
758         if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
759                 cfqq->service_last = now;
760                 cfq_schedule_dispatch(cfqd);
761         }
762
763         cfq_clear_cfqq_must_dispatch(cfqq);
764         cfq_clear_cfqq_wait_request(cfqq);
765
766         /*
767          * store what was left of this slice, if the queue idled out
768          * or was preempted
769          */
770         if (time_after(cfqq->slice_end, now))
771                 cfqq->slice_left = cfqq->slice_end - now;
772         else
773                 cfqq->slice_left = 0;
774
775         if (cfq_cfqq_on_rr(cfqq))
776                 cfq_resort_rr_list(cfqq, preempted);
777
778         if (cfqq == cfqd->active_queue)
779                 cfqd->active_queue = NULL;
780
781         if (cfqd->active_cic) {
782                 put_io_context(cfqd->active_cic->ioc);
783                 cfqd->active_cic = NULL;
784         }
785
786         cfqd->dispatch_slice = 0;
787 }
788
789 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
790 {
791         struct cfq_queue *cfqq = cfqd->active_queue;
792
793         if (cfqq)
794                 __cfq_slice_expired(cfqd, cfqq, preempted);
795 }
796
797 /*
798  * 0
799  * 0,1
800  * 0,1,2
801  * 0,1,2,3
802  * 0,1,2,3,4
803  * 0,1,2,3,4,5
804  * 0,1,2,3,4,5,6
805  * 0,1,2,3,4,5,6,7
806  */
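/*
 * The listing above documents the intended scan pattern: the
 * [cur_prio, cur_end_prio] window grows by one level each time it wraps, so
 * rr_list[0] is eligible on every pass while lower-priority lists are merged
 * into cur_rr progressively less often.
 */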
807 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
808 {
809         int prio, wrap;
810
811         prio = -1;
812         wrap = 0;
813         do {
814                 int p;
815
816                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
817                         if (!list_empty(&cfqd->rr_list[p])) {
818                                 prio = p;
819                                 break;
820                         }
821                 }
822
823                 if (prio != -1)
824                         break;
825                 cfqd->cur_prio = 0;
826                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
827                         cfqd->cur_end_prio = 0;
828                         if (wrap)
829                                 break;
830                         wrap = 1;
831                 }
832         } while (1);
833
834         if (unlikely(prio == -1))
835                 return -1;
836
837         BUG_ON(prio >= CFQ_PRIO_LISTS);
838
839         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
840
841         cfqd->cur_prio = prio + 1;
842         if (cfqd->cur_prio > cfqd->cur_end_prio) {
843                 cfqd->cur_end_prio = cfqd->cur_prio;
844                 cfqd->cur_prio = 0;
845         }
846         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
847                 cfqd->cur_prio = 0;
848                 cfqd->cur_end_prio = 0;
849         }
850
851         return prio;
852 }
853
854 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
855 {
856         struct cfq_queue *cfqq = NULL;
857
858         /*
859          * if current list is non-empty, grab first entry. if it is empty,
860          * get next prio level and grab first entry then if any are spliced
861          */
862         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
863                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
864
865         /*
866          * if we have idle queues and no rt or be queues had pending
867          * requests, either allow immediate service if the grace period
868          * has passed or arm the idle grace timer
869          */
870         if (!cfqq && !list_empty(&cfqd->idle_rr)) {
871                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
872
873                 if (time_after_eq(jiffies, end))
874                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
875                 else
876                         mod_timer(&cfqd->idle_class_timer, end);
877         }
878
879         __cfq_set_active_queue(cfqd, cfqq);
880         return cfqq;
881 }
882
883 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
884
885 {
886         unsigned long sl;
887
888         WARN_ON(!RB_EMPTY(&cfqq->sort_list));
889         WARN_ON(cfqq != cfqd->active_queue);
890
891         /*
892          * idle is disabled, either manually or by past process history
893          */
894         if (!cfqd->cfq_slice_idle)
895                 return 0;
896         if (!cfq_cfqq_idle_window(cfqq))
897                 return 0;
898         /*
899          * task has exited, don't wait
900          */
901         if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
902                 return 0;
903
904         cfq_mark_cfqq_must_dispatch(cfqq);
905         cfq_mark_cfqq_wait_request(cfqq);
906
907         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
908         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
909         return 1;
910 }
911
912 static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
913 {
914         struct cfq_data *cfqd = q->elevator->elevator_data;
915         struct cfq_queue *cfqq = crq->cfq_queue;
916
917         cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
918         cfq_remove_request(crq->request);
919         cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
920         elv_dispatch_sort(q, crq->request);
921 }
922
923 /*
924  * return expired entry, or NULL to just start from scratch in rbtree
925  */
926 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
927 {
928         struct cfq_data *cfqd = cfqq->cfqd;
929         struct request *rq;
930         struct cfq_rq *crq;
931
932         if (cfq_cfqq_fifo_expire(cfqq))
933                 return NULL;
934
935         if (!list_empty(&cfqq->fifo)) {
936                 int fifo = cfq_cfqq_class_sync(cfqq);
937
938                 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
939                 rq = crq->request;
940                 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
941                         cfq_mark_cfqq_fifo_expire(cfqq);
942                         return crq;
943                 }
944         }
945
946         return NULL;
947 }
948
949 /*
950  * Scale schedule slice based on io priority. Use the sync time slice only
951  * if a queue is marked sync and has sync io queued. A sync queue with async
952  * io only should not get the full sync slice length.
953  */
954 static inline int
955 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
956 {
957         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
958
959         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
960
961         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
962 }
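
/*
 * Worked example, assuming HZ = 1000 so that cfq_slice_sync = HZ / 10 = 100
 * jiffies: base_slice / CFQ_SLICE_SCALE = 20, giving a sync queue
 *   ioprio 0: 100 + 20 * 4  = 180 jiffies
 *   ioprio 4: 100 + 20 * 0  = 100 jiffies
 *   ioprio 7: 100 + 20 * -3 =  40 jiffies
 */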
963
964 static inline void
965 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
966 {
967         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
968 }
969
970 static inline int
971 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
972 {
973         const int base_rq = cfqd->cfq_slice_async_rq;
974
975         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
976
977         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
978 }
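
/*
 * With the default cfq_slice_async_rq = 2 and CFQ_PRIO_LISTS = 8 this allows
 * 2 * (2 + 2 * 7) = 32 requests per slice at ioprio 0, down to
 * 2 * (2 + 2 * 0) = 4 at ioprio 7.
 */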
979
980 /*
981  * get next queue for service
982  */
983 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
984 {
985         unsigned long now = jiffies;
986         struct cfq_queue *cfqq;
987
988         cfqq = cfqd->active_queue;
989         if (!cfqq)
990                 goto new_queue;
991
992         /*
993          * slice has expired
994          */
995         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
996                 goto expire;
997
998         /*
999          * if queue has requests, dispatch one. if not, check if
1000          * enough slice is left to wait for one
1001          */
1002         if (!RB_EMPTY(&cfqq->sort_list))
1003                 goto keep_queue;
1004         else if (cfq_cfqq_class_sync(cfqq) &&
1005                  time_before(now, cfqq->slice_end)) {
1006                 if (cfq_arm_slice_timer(cfqd, cfqq))
1007                         return NULL;
1008         }
1009
1010 expire:
1011         cfq_slice_expired(cfqd, 0);
1012 new_queue:
1013         cfqq = cfq_set_active_queue(cfqd);
1014 keep_queue:
1015         return cfqq;
1016 }
1017
1018 static int
1019 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1020                         int max_dispatch)
1021 {
1022         int dispatched = 0;
1023
1024         BUG_ON(RB_EMPTY(&cfqq->sort_list));
1025
1026         do {
1027                 struct cfq_rq *crq;
1028
1029                 /*
1030                  * follow expired path, else get first next available
1031                  */
1032                 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1033                         crq = cfqq->next_crq;
1034
1035                 /*
1036                  * finally, insert request into driver dispatch list
1037                  */
1038                 cfq_dispatch_insert(cfqd->queue, crq);
1039
1040                 cfqd->dispatch_slice++;
1041                 dispatched++;
1042
1043                 if (!cfqd->active_cic) {
1044                         atomic_inc(&crq->io_context->ioc->refcount);
1045                         cfqd->active_cic = crq->io_context;
1046                 }
1047
1048                 if (RB_EMPTY(&cfqq->sort_list))
1049                         break;
1050
1051         } while (dispatched < max_dispatch);
1052
1053         /*
1054          * if slice end isn't set yet, set it. if at least one request was
1055          * sync, use the sync time slice value
1056          */
1057         if (!cfqq->slice_end)
1058                 cfq_set_prio_slice(cfqd, cfqq);
1059
1060         /*
1061          * expire an async queue immediately if it has used up its slice. idle
1062          * queues always expire after one dispatch round.
1063          */
1064         if ((!cfq_cfqq_sync(cfqq) &&
1065             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1066             cfq_class_idle(cfqq))
1067                 cfq_slice_expired(cfqd, 0);
1068
1069         return dispatched;
1070 }
1071
1072 static int
1073 cfq_forced_dispatch_cfqqs(struct list_head *list)
1074 {
1075         int dispatched = 0;
1076         struct cfq_queue *cfqq, *next;
1077         struct cfq_rq *crq;
1078
1079         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1080                 while ((crq = cfqq->next_crq)) {
1081                         cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1082                         dispatched++;
1083                 }
1084                 BUG_ON(!list_empty(&cfqq->fifo));
1085         }
1086         return dispatched;
1087 }
1088
1089 static int
1090 cfq_forced_dispatch(struct cfq_data *cfqd)
1091 {
1092         int i, dispatched = 0;
1093
1094         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1095                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1096
1097         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1098         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1099         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1100
1101         cfq_slice_expired(cfqd, 0);
1102
1103         BUG_ON(cfqd->busy_queues);
1104
1105         return dispatched;
1106 }
1107
1108 static int
1109 cfq_dispatch_requests(request_queue_t *q, int force)
1110 {
1111         struct cfq_data *cfqd = q->elevator->elevator_data;
1112         struct cfq_queue *cfqq;
1113
1114         if (!cfqd->busy_queues)
1115                 return 0;
1116
1117         if (unlikely(force))
1118                 return cfq_forced_dispatch(cfqd);
1119
1120         cfqq = cfq_select_queue(cfqd);
1121         if (cfqq) {
1122                 int max_dispatch;
1123
1124                 /*
1125                  * if idle window is disabled, allow queue buildup
1126                  */
1127                 if (!cfq_cfqq_idle_window(cfqq) &&
1128                     cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1129                         return 0;
1130
1131                 cfq_clear_cfqq_must_dispatch(cfqq);
1132                 cfq_clear_cfqq_wait_request(cfqq);
1133                 del_timer(&cfqd->idle_slice_timer);
1134
1135                 max_dispatch = cfqd->cfq_quantum;
1136                 if (cfq_class_idle(cfqq))
1137                         max_dispatch = 1;
1138
1139                 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1140         }
1141
1142         return 0;
1143 }
1144
1145 /*
1146  * task holds one reference to the queue, dropped when task exits. each crq
1147  * in-flight on this queue also holds a reference, dropped when crq is freed.
1148  *
1149  * queue lock must be held here.
1150  */
1151 static void cfq_put_queue(struct cfq_queue *cfqq)
1152 {
1153         struct cfq_data *cfqd = cfqq->cfqd;
1154
1155         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1156
1157         if (!atomic_dec_and_test(&cfqq->ref))
1158                 return;
1159
1160         BUG_ON(rb_first(&cfqq->sort_list));
1161         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1162         BUG_ON(cfq_cfqq_on_rr(cfqq));
1163
1164         if (unlikely(cfqd->active_queue == cfqq))
1165                 __cfq_slice_expired(cfqd, cfqq, 0);
1166
1167         cfq_put_cfqd(cfqq->cfqd);
1168
1169         /*
1170          * it's on the empty list and still hashed
1171          */
1172         list_del(&cfqq->cfq_list);
1173         hlist_del(&cfqq->cfq_hash);
1174         kmem_cache_free(cfq_pool, cfqq);
1175 }
1176
1177 static inline struct cfq_queue *
1178 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1179                     const int hashval)
1180 {
1181         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1182         struct hlist_node *entry, *next;
1183
1184         hlist_for_each_safe(entry, next, hash_list) {
1185                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1186                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1187
1188                 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1189                         return __cfqq;
1190         }
1191
1192         return NULL;
1193 }
1194
1195 static struct cfq_queue *
1196 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1197 {
1198         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1199 }
1200
1201 static void cfq_free_io_context(struct cfq_io_context *cic)
1202 {
1203         struct cfq_io_context *__cic;
1204         struct list_head *entry, *next;
1205
1206         list_for_each_safe(entry, next, &cic->list) {
1207                 __cic = list_entry(entry, struct cfq_io_context, list);
1208                 kmem_cache_free(cfq_ioc_pool, __cic);
1209         }
1210
1211         kmem_cache_free(cfq_ioc_pool, cic);
1212 }
1213
1214 static void cfq_trim(struct io_context *ioc)
1215 {
1216         ioc->set_ioprio = NULL;
1217         if (ioc->cic)
1218                 cfq_free_io_context(ioc->cic);
1219 }
1220
1221 /*
1222  * Called with interrupts disabled
1223  */
1224 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1225 {
1226         struct cfq_data *cfqd = cic->key;
1227         request_queue_t *q;
1228
1229         if (!cfqd)
1230                 return;
1231
1232         q = cfqd->queue;
1233
1234         WARN_ON(!irqs_disabled());
1235
1236         spin_lock(q->queue_lock);
1237
1238         if (cic->cfqq[ASYNC]) {
1239                 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1240                         __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1241                 cfq_put_queue(cic->cfqq[ASYNC]);
1242                 cic->cfqq[ASYNC] = NULL;
1243         }
1244
1245         if (cic->cfqq[SYNC]) {
1246                 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1247                         __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1248                 cfq_put_queue(cic->cfqq[SYNC]);
1249                 cic->cfqq[SYNC] = NULL;
1250         }
1251
1252         cic->key = NULL;
1253         list_del_init(&cic->queue_list);
1254         spin_unlock(q->queue_lock);
1255 }
1256
1257 /*
1258  * Another task may update the task's cic list if it is doing a queue lookup
1259  * on its behalf. cfq_exit_lock excludes such concurrent updates.
1260  */
1261 static void cfq_exit_io_context(struct cfq_io_context *cic)
1262 {
1263         struct cfq_io_context *__cic;
1264         struct list_head *entry;
1265         unsigned long flags;
1266
1267         local_irq_save(flags);
1268
1269         /*
1270          * put the reference this task is holding to the various queues
1271          */
1272         read_lock(&cfq_exit_lock);
1273         list_for_each(entry, &cic->list) {
1274                 __cic = list_entry(entry, struct cfq_io_context, list);
1275                 cfq_exit_single_io_context(__cic);
1276         }
1277
1278         cfq_exit_single_io_context(cic);
1279         read_unlock(&cfq_exit_lock);
1280         local_irq_restore(flags);
1281 }
1282
1283 static struct cfq_io_context *
1284 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1285 {
1286         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1287
1288         if (cic) {
1289                 INIT_LIST_HEAD(&cic->list);
1290                 cic->cfqq[ASYNC] = NULL;
1291                 cic->cfqq[SYNC] = NULL;
1292                 cic->key = NULL;
1293                 cic->last_end_request = jiffies;
1294                 cic->ttime_total = 0;
1295                 cic->ttime_samples = 0;
1296                 cic->ttime_mean = 0;
1297                 cic->dtor = cfq_free_io_context;
1298                 cic->exit = cfq_exit_io_context;
1299                 INIT_LIST_HEAD(&cic->queue_list);
1300         }
1301
1302         return cic;
1303 }
1304
1305 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1306 {
1307         struct task_struct *tsk = current;
1308         int ioprio_class;
1309
1310         if (!cfq_cfqq_prio_changed(cfqq))
1311                 return;
1312
1313         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1314         switch (ioprio_class) {
1315                 default:
1316                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1317                 case IOPRIO_CLASS_NONE:
1318                         /*
1319                          * no prio set, place us in the middle of the BE classes
1320                          */
1321                         cfqq->ioprio = task_nice_ioprio(tsk);
1322                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1323                         break;
1324                 case IOPRIO_CLASS_RT:
1325                         cfqq->ioprio = task_ioprio(tsk);
1326                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1327                         break;
1328                 case IOPRIO_CLASS_BE:
1329                         cfqq->ioprio = task_ioprio(tsk);
1330                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1331                         break;
1332                 case IOPRIO_CLASS_IDLE:
1333                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1334                         cfqq->ioprio = 7;
1335                         cfq_clear_cfqq_idle_window(cfqq);
1336                         break;
1337         }
1338
1339         /*
1340          * keep track of original prio settings in case we have to temporarily
1341          * elevate the priority of this queue
1342          */
1343         cfqq->org_ioprio = cfqq->ioprio;
1344         cfqq->org_ioprio_class = cfqq->ioprio_class;
1345
1346         if (cfq_cfqq_on_rr(cfqq))
1347                 cfq_resort_rr_list(cfqq, 0);
1348
1349         cfq_clear_cfqq_prio_changed(cfqq);
1350 }
1351
1352 static inline void changed_ioprio(struct cfq_io_context *cic)
1353 {
1354         struct cfq_data *cfqd = cic->key;
1355         struct cfq_queue *cfqq;
1356         if (cfqd) {
1357                 spin_lock(cfqd->queue->queue_lock);
1358                 cfqq = cic->cfqq[ASYNC];
1359                 if (cfqq) {
1360                         cfq_mark_cfqq_prio_changed(cfqq);
1361                         cfq_init_prio_data(cfqq);
1362                 }
1363                 cfqq = cic->cfqq[SYNC];
1364                 if (cfqq) {
1365                         cfq_mark_cfqq_prio_changed(cfqq);
1366                         cfq_init_prio_data(cfqq);
1367                 }
1368                 spin_unlock(cfqd->queue->queue_lock);
1369         }
1370 }
1371
1372 /*
1373  * callback from sys_ioprio_set, irqs are disabled
1374  */
1375 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1376 {
1377         struct cfq_io_context *cic;
1378
1379         write_lock(&cfq_exit_lock);
1380
1381         cic = ioc->cic;
1382
1383         changed_ioprio(cic);
1384
1385         list_for_each_entry(cic, &cic->list, list)
1386                 changed_ioprio(cic);
1387
1388         write_unlock(&cfq_exit_lock);
1389
1390         return 0;
1391 }
1392
1393 static struct cfq_queue *
1394 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
1395               gfp_t gfp_mask)
1396 {
1397         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1398         struct cfq_queue *cfqq, *new_cfqq = NULL;
1399
1400 retry:
1401         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1402
1403         if (!cfqq) {
1404                 if (new_cfqq) {
1405                         cfqq = new_cfqq;
1406                         new_cfqq = NULL;
1407                 } else if (gfp_mask & __GFP_WAIT) {
1408                         spin_unlock_irq(cfqd->queue->queue_lock);
1409                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1410                         spin_lock_irq(cfqd->queue->queue_lock);
1411                         goto retry;
1412                 } else {
1413                         cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1414                         if (!cfqq)
1415                                 goto out;
1416                 }
1417
1418                 memset(cfqq, 0, sizeof(*cfqq));
1419
1420                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1421                 INIT_LIST_HEAD(&cfqq->cfq_list);
1422                 RB_CLEAR_ROOT(&cfqq->sort_list);
1423                 INIT_LIST_HEAD(&cfqq->fifo);
1424
1425                 cfqq->key = key;
1426                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1427                 atomic_set(&cfqq->ref, 0);
1428                 cfqq->cfqd = cfqd;
1429                 atomic_inc(&cfqd->ref);
1430                 cfqq->service_last = 0;
1431                 /*
1432                  * set ->slice_left to allow preemption for a new process
1433                  */
1434                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1435                 cfq_mark_cfqq_idle_window(cfqq);
1436                 cfq_mark_cfqq_prio_changed(cfqq);
1437                 cfq_init_prio_data(cfqq);
1438         }
1439
1440         if (new_cfqq)
1441                 kmem_cache_free(cfq_pool, new_cfqq);
1442
1443         atomic_inc(&cfqq->ref);
1444 out:
1445         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1446         return cfqq;
1447 }
1448
1449 /*
1450  * Setup general io context and cfq io context. There can be several cfq
1451  * io contexts per general io context, if this process is doing io to more
1452  * than one device managed by cfq. Note that caller is holding a reference to
1453  * cfqq, so we don't need to worry about it disappearing
1454  */
1455 static struct cfq_io_context *
1456 cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1457 {
1458         struct io_context *ioc = NULL;
1459         struct cfq_io_context *cic;
1460
1461         might_sleep_if(gfp_mask & __GFP_WAIT);
1462
1463         ioc = get_io_context(gfp_mask);
1464         if (!ioc)
1465                 return NULL;
1466
1467 restart:
1468         if ((cic = ioc->cic) == NULL) {
1469                 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1470
1471                 if (cic == NULL)
1472                         goto err;
1473
1474                 /*
1475                  * manually increment generic io_context usage count, it
1476                  * cannot go away since we are already holding one ref to it
1477                  */
1478                 cic->ioc = ioc;
1479                 cic->key = cfqd;
1480                 read_lock(&cfq_exit_lock);
1481                 ioc->set_ioprio = cfq_ioc_set_ioprio;
1482                 ioc->cic = cic;
1483                 list_add(&cic->queue_list, &cfqd->cic_list);
1484                 read_unlock(&cfq_exit_lock);
1485         } else {
1486                 struct cfq_io_context *__cic;
1487
1488                 /*
1489                  * the first cic on the list is actually the head itself
1490                  */
1491                 if (cic->key == cfqd)
1492                         goto out;
1493
1494                 if (unlikely(!cic->key)) {
1495                         read_lock(&cfq_exit_lock);
1496                         if (list_empty(&cic->list))
1497                                 ioc->cic = NULL;
1498                         else
1499                                 ioc->cic = list_entry(cic->list.next,
1500                                                       struct cfq_io_context,
1501                                                       list);
1502                         read_unlock(&cfq_exit_lock);
1503                         kmem_cache_free(cfq_ioc_pool, cic);
1504                         goto restart;
1505                 }
1506
1507                 /*
1508                  * cic exists, check if we already are there. linear search
1509                  * should be ok here, the list will usually not be more than
1510                  * 1 or a few entries long
1511                  */
1512                 list_for_each_entry(__cic, &cic->list, list) {
1513                         /*
1514                          * this process is already holding a reference to
1515                          * this queue, so no need to get one more
1516                          */
1517                         if (__cic->key == cfqd) {
1518                                 cic = __cic;
1519                                 goto out;
1520                         }
1521                         if (unlikely(!__cic->key)) {
1522                                 read_lock(&cfq_exit_lock);
1523                                 list_del(&__cic->list);
1524                                 read_unlock(&cfq_exit_lock);
1525                                 kmem_cache_free(cfq_ioc_pool, __cic);
1526                                 goto restart;
1527                         }
1528                 }
1529
1530                 /*
1531                  * nope, process doesn't have a cic associated with this
1532                  * cfqd yet. get a new one and add to list
1533                  */
1534                 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1535                 if (__cic == NULL)
1536                         goto err;
1537
1538                 __cic->ioc = ioc;
1539                 __cic->key = cfqd;
1540                 read_lock(&cfq_exit_lock);
1541                 list_add(&__cic->list, &cic->list);
1542                 list_add(&__cic->queue_list, &cfqd->cic_list);
1543                 read_unlock(&cfq_exit_lock);
1544                 cic = __cic;
1545         }
1546
1547 out:
1548         return cic;
1549 err:
1550         put_io_context(ioc);
1551         return NULL;
1552 }
1553
1554 static void
1555 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1556 {
1557         unsigned long elapsed, ttime;
1558
1559         /*
1560          * if this context already has stuff queued, thinktime is from
1561          * last queue not last end
1562          */
1563 #if 0
1564         if (time_after(cic->last_end_request, cic->last_queue))
1565                 elapsed = jiffies - cic->last_end_request;
1566         else
1567                 elapsed = jiffies - cic->last_queue;
1568 #else
1569         elapsed = jiffies - cic->last_end_request;
1570 #endif
1571
1572         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1573
1574         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1575         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1576         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1577 }
1578
1579 #define sample_valid(samples)   ((samples) > 80)
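
/*
 * The fixed-point decay above keeps ttime_samples converging toward 256 and
 * ttime_total toward 256 * (mean think time), so ttime_mean approximates the
 * recent average think time in jiffies. With integer rounding, ttime_samples
 * reaches 32, 60, 84 after the first three updates, so sample_valid() only
 * becomes true once roughly three requests have been observed.
 */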
1580
1581 /*
1582  * Disable idle window if the process thinks too long or seeks so much that
1583  * it doesn't matter
1584  */
1585 static void
1586 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1587                        struct cfq_io_context *cic)
1588 {
1589         int enable_idle = cfq_cfqq_idle_window(cfqq);
1590
1591         if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1592                 enable_idle = 0;
1593         else if (sample_valid(cic->ttime_samples)) {
1594                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1595                         enable_idle = 0;
1596                 else
1597                         enable_idle = 1;
1598         }
1599
1600         if (enable_idle)
1601                 cfq_mark_cfqq_idle_window(cfqq);
1602         else
1603                 cfq_clear_cfqq_idle_window(cfqq);
1604 }
1605
1606
1607 /*
1608  * Check if new_cfqq should preempt the currently active queue. Return 0 for
1609  * no (or if we aren't sure); returning 1 will cause a preempt.
1610  */
1611 static int
1612 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1613                    struct cfq_rq *crq)
1614 {
1615         struct cfq_queue *cfqq = cfqd->active_queue;
1616
1617         if (cfq_class_idle(new_cfqq))
1618                 return 0;
1619
1620         if (!cfqq)
1621                 return 1;
1622
1623         if (cfq_class_idle(cfqq))
1624                 return 1;
1625         if (!cfq_cfqq_wait_request(new_cfqq))
1626                 return 0;
1627         /*
1628          * if it doesn't have slice left, forget it
1629          */
1630         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1631                 return 0;
1632         if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1633                 return 1;
1634
1635         return 0;
1636 }
1637
1638 /*
1639  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1640  * let it have half of its nominal slice.
1641  */
1642 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1643 {
1644         struct cfq_queue *__cfqq, *next;
1645
1646         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1647                 cfq_resort_rr_list(__cfqq, 1);
1648
1649         if (!cfqq->slice_left)
1650                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1651
1652         cfqq->slice_end = cfqq->slice_left + jiffies;
1653         __cfq_slice_expired(cfqd, cfqq, 1);
1654         __cfq_set_active_queue(cfqd, cfqq);
1655 }
1656
1657 /*
1658  * should really be a ll_rw_blk.c helper
1659  */
1660 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1661 {
1662         request_queue_t *q = cfqd->queue;
1663
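        /*
         * both branches should end up invoking ->request_fn(): if the queue
         * is plugged, __generic_unplug_device() removes the plug and (unless
         * the queue has been stopped) calls the request_fn itself.
         */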
1664         if (!blk_queue_plugged(q))
1665                 q->request_fn(q);
1666         else
1667                 __generic_unplug_device(q);
1668 }
1669
1670 /*
1671  * Called when a new fs request (crq) is added (to cfqq). Check if there's
1672  * something we should do about it
1673  */
1674 static void
1675 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1676                  struct cfq_rq *crq)
1677 {
1678         struct cfq_io_context *cic;
1679
1680         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1681
1682         /*
1683          * we never wait for an async request and we don't allow preemption
1684          * of an async request. so just return early
1685          */
1686         if (!cfq_crq_is_sync(crq))
1687                 return;
1688
1689         cic = crq->io_context;
1690
1691         cfq_update_io_thinktime(cfqd, cic);
1692         cfq_update_idle_window(cfqd, cfqq, cic);
1693
1694         cic->last_queue = jiffies;
1695
1696         if (cfqq == cfqd->active_queue) {
1697                 /*
1698                  * if we are waiting for a request for this queue, let it rip
1699                  * immediately and flag that we must not expire this queue
1700                  * just now
1701                  */
1702                 if (cfq_cfqq_wait_request(cfqq)) {
1703                         cfq_mark_cfqq_must_dispatch(cfqq);
1704                         del_timer(&cfqd->idle_slice_timer);
1705                         cfq_start_queueing(cfqd, cfqq);
1706                 }
1707         } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1708                 /*
1709                  * not the active queue - expire current slice if it is
1710                  * idle and has expired its mean thinktime, or this new queue
1711                  * has some old slice time left and is of higher priority
1712                  */
1713                 cfq_preempt_queue(cfqd, cfqq);
1714                 cfq_mark_cfqq_must_dispatch(cfqq);
1715                 cfq_start_queueing(cfqd, cfqq);
1716         }
1717 }
1718
1719 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1720 {
1721         struct cfq_data *cfqd = q->elevator->elevator_data;
1722         struct cfq_rq *crq = RQ_DATA(rq);
1723         struct cfq_queue *cfqq = crq->cfq_queue;
1724
1725         cfq_init_prio_data(cfqq);
1726
1727         cfq_add_crq_rb(crq);
1728
1729         list_add_tail(&rq->queuelist, &cfqq->fifo);
1730
1731         if (rq_mergeable(rq))
1732                 cfq_add_crq_hash(cfqd, crq);
1733
1734         cfq_crq_enqueued(cfqd, cfqq, crq);
1735 }
1736
1737 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1738 {
1739         struct cfq_rq *crq = RQ_DATA(rq);
1740         struct cfq_queue *cfqq = crq->cfq_queue;
1741         struct cfq_data *cfqd = cfqq->cfqd;
1742         const int sync = cfq_crq_is_sync(crq);
1743         unsigned long now;
1744
1745         now = jiffies;
1746
1747         WARN_ON(!cfqd->rq_in_driver);
1748         WARN_ON(!cfqq->on_dispatch[sync]);
1749         cfqd->rq_in_driver--;
1750         cfqq->on_dispatch[sync]--;
1751
1752         if (!cfq_class_idle(cfqq))
1753                 cfqd->last_end_request = now;
1754
1755         if (!cfq_cfqq_dispatched(cfqq)) {
1756                 if (cfq_cfqq_on_rr(cfqq)) {
1757                         cfqq->service_last = now;
1758                         cfq_resort_rr_list(cfqq, 0);
1759                 }
1760                 cfq_schedule_dispatch(cfqd);
1761         }
1762
1763         if (cfq_crq_is_sync(crq))
1764                 crq->io_context->last_end_request = now;
1765 }
1766
1767 static struct request *
1768 cfq_former_request(request_queue_t *q, struct request *rq)
1769 {
1770         struct cfq_rq *crq = RQ_DATA(rq);
1771         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1772
1773         if (rbprev)
1774                 return rb_entry_crq(rbprev)->request;
1775
1776         return NULL;
1777 }
1778
1779 static struct request *
1780 cfq_latter_request(request_queue_t *q, struct request *rq)
1781 {
1782         struct cfq_rq *crq = RQ_DATA(rq);
1783         struct rb_node *rbnext = rb_next(&crq->rb_node);
1784
1785         if (rbnext)
1786                 return rb_entry_crq(rbnext)->request;
1787
1788         return NULL;
1789 }
1790
1791 /*
1792  * we temporarily boost lower priority queues if they are holding fs exclusive
1793  * resources. they are boosted to normal prio (CLASS_BE/4)
1794  */
1795 static void cfq_prio_boost(struct cfq_queue *cfqq)
1796 {
1797         const int ioprio_class = cfqq->ioprio_class;
1798         const int ioprio = cfqq->ioprio;
1799
1800         if (has_fs_excl()) {
1801                 /*
1802                  * boost idle prio on transactions that would lock out other
1803                  * users of the filesystem
1804                  */
1805                 if (cfq_class_idle(cfqq))
1806                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1807                 if (cfqq->ioprio > IOPRIO_NORM)
1808                         cfqq->ioprio = IOPRIO_NORM;
1809         } else {
1810                 /*
1811                  * check if we need to unboost the queue
1812                  */
1813                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1814                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1815                 if (cfqq->ioprio != cfqq->org_ioprio)
1816                         cfqq->ioprio = cfqq->org_ioprio;
1817         }
1818
1819         /*
1820          * refile between round-robin lists if we moved the priority class
1821          */
1822         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1823             cfq_cfqq_on_rr(cfqq))
1824                 cfq_resort_rr_list(cfqq, 0);
1825 }
1826
1827 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1828 {
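        /*
         * reads, and writes from a task that process_sync() reports as doing
         * synchronous I/O, are keyed by pid and so get a per-process queue;
         * everything else maps to the shared CFQ_KEY_ASYNC key, which is
         * also how cfq_set_request() tells sync from async.
         */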
1829         if (rw == READ || process_sync(task))
1830                 return task->pid;
1831
1832         return CFQ_KEY_ASYNC;
1833 }
1834
1835 static inline int
1836 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1837                 struct task_struct *task, int rw)
1838 {
1839 #if 1
1840         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1841             !cfq_cfqq_must_alloc_slice(cfqq)) {
1842                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1843                 return ELV_MQUEUE_MUST;
1844         }
1845
1846         return ELV_MQUEUE_MAY;
1847 #else
1848         if (!cfqq || task->flags & PF_MEMALLOC)
1849                 return ELV_MQUEUE_MAY;
1850         if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1851                 if (cfq_cfqq_wait_request(cfqq))
1852                         return ELV_MQUEUE_MUST;
1853
1854                 /*
1855                  * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1856                  * can quickly flood the queue with writes from a single task
1857                  */
1858                 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1859                         cfq_mark_cfqq_must_alloc_slice(cfqq);
1860                         return ELV_MQUEUE_MUST;
1861                 }
1862
1863                 return ELV_MQUEUE_MAY;
1864         }
1865         if (cfq_class_idle(cfqq))
1866                 return ELV_MQUEUE_NO;
1867         if (cfqq->allocated[rw] >= cfqd->max_queued) {
1868                 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1869                 int ret = ELV_MQUEUE_NO;
1870
1871                 if (ioc && ioc->nr_batch_requests)
1872                         ret = ELV_MQUEUE_MAY;
1873
1874                 put_io_context(ioc);
1875                 return ret;
1876         }
1877
1878         return ELV_MQUEUE_MAY;
1879 #endif
1880 }
1881
1882 static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1883 {
1884         struct cfq_data *cfqd = q->elevator->elevator_data;
1885         struct task_struct *tsk = current;
1886         struct cfq_queue *cfqq;
1887
1888         /*
1889          * don't force setup of a queue from here, as a call to may_queue
1890          * does not necessarily imply that a request actually will be queued.
1891          * so just look up a possibly existing queue, or return 'may queue'
1892          * if that fails
1893          */
1894         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1895         if (cfqq) {
1896                 cfq_init_prio_data(cfqq);
1897                 cfq_prio_boost(cfqq);
1898
1899                 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1900         }
1901
1902         return ELV_MQUEUE_MAY;
1903 }
1904
1905 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1906 {
1907         struct cfq_data *cfqd = q->elevator->elevator_data;
1908         struct request_list *rl = &q->rq;
1909
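        /*
         * wake sleepers on the request_list once this queue is back at or
         * below max_queued, or whenever allocation is starved; the starved
         * case pairs with cfq_set_request() setting rq_starved and
         * cfq_kick_queue() doing the same opportunistic wakeup.
         */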
1910         if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1911                 smp_mb();
1912                 if (waitqueue_active(&rl->wait[READ]))
1913                         wake_up(&rl->wait[READ]);
1914         }
1915
1916         if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1917                 smp_mb();
1918                 if (waitqueue_active(&rl->wait[WRITE]))
1919                         wake_up(&rl->wait[WRITE]);
1920         }
1921 }
1922
1923 /*
1924  * queue lock held here
1925  */
1926 static void cfq_put_request(request_queue_t *q, struct request *rq)
1927 {
1928         struct cfq_data *cfqd = q->elevator->elevator_data;
1929         struct cfq_rq *crq = RQ_DATA(rq);
1930
1931         if (crq) {
1932                 struct cfq_queue *cfqq = crq->cfq_queue;
1933                 const int rw = rq_data_dir(rq);
1934
1935                 BUG_ON(!cfqq->allocated[rw]);
1936                 cfqq->allocated[rw]--;
1937
1938                 put_io_context(crq->io_context->ioc);
1939
1940                 mempool_free(crq, cfqd->crq_pool);
1941                 rq->elevator_private = NULL;
1942
1943                 cfq_check_waiters(q, cfqq);
1944                 cfq_put_queue(cfqq);
1945         }
1946 }
1947
1948 /*
1949  * Allocate cfq data structures associated with this request.
1950  */
1951 static int
1952 cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1953                 gfp_t gfp_mask)
1954 {
1955         struct cfq_data *cfqd = q->elevator->elevator_data;
1956         struct task_struct *tsk = current;
1957         struct cfq_io_context *cic;
1958         const int rw = rq_data_dir(rq);
1959         pid_t key = cfq_queue_pid(tsk, rw);
1960         struct cfq_queue *cfqq;
1961         struct cfq_rq *crq;
1962         unsigned long flags;
1963         int is_sync = key != CFQ_KEY_ASYNC;
1964
1965         might_sleep_if(gfp_mask & __GFP_WAIT);
1966
1967         cic = cfq_get_io_context(cfqd, key, gfp_mask);
1968
1969         spin_lock_irqsave(q->queue_lock, flags);
1970
1971         if (!cic)
1972                 goto queue_fail;
1973
1974         if (!cic->cfqq[is_sync]) {
1975                 cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
1976                 if (!cfqq)
1977                         goto queue_fail;
1978
1979                 cic->cfqq[is_sync] = cfqq;
1980         } else
1981                 cfqq = cic->cfqq[is_sync];
1982
1983         cfqq->allocated[rw]++;
1984         cfq_clear_cfqq_must_alloc(cfqq);
1985         cfqd->rq_starved = 0;
1986         atomic_inc(&cfqq->ref);
1987         spin_unlock_irqrestore(q->queue_lock, flags);
1988
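        /*
         * the queue lock has been dropped above: with __GFP_WAIT set in
         * gfp_mask, the mempool allocation below is allowed to sleep.
         */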
1989         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1990         if (crq) {
1991                 RB_CLEAR(&crq->rb_node);
1992                 crq->rb_key = 0;
1993                 crq->request = rq;
1994                 INIT_HLIST_NODE(&crq->hash);
1995                 crq->cfq_queue = cfqq;
1996                 crq->io_context = cic;
1997
1998                 if (is_sync)
1999                         cfq_mark_crq_is_sync(crq);
2000                 else
2001                         cfq_clear_crq_is_sync(crq);
2002
2003                 rq->elevator_private = crq;
2004                 return 0;
2005         }
2006
2007         spin_lock_irqsave(q->queue_lock, flags);
2008         cfqq->allocated[rw]--;
2009         if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2010                 cfq_mark_cfqq_must_alloc(cfqq);
2011         cfq_put_queue(cfqq);
2012 queue_fail:
2013         if (cic)
2014                 put_io_context(cic->ioc);
2015         /*
2016          * mark the queue as rq allocation starved. we need to kickstart the
2017          * dispatch process ourselves if there are no pending requests that can
2018          * do it for us. that would be an extremely rare OOM situation
2019          */
2020         cfqd->rq_starved = 1;
2021         cfq_schedule_dispatch(cfqd);
2022         spin_unlock_irqrestore(q->queue_lock, flags);
2023         return 1;
2024 }
2025
2026 static void cfq_kick_queue(void *data)
2027 {
2028         request_queue_t *q = data;
2029         struct cfq_data *cfqd = q->elevator->elevator_data;
2030         unsigned long flags;
2031
2032         spin_lock_irqsave(q->queue_lock, flags);
2033
2034         if (cfqd->rq_starved) {
2035                 struct request_list *rl = &q->rq;
2036
2037                 /*
2038                  * we aren't guaranteed to get a request after this, but we
2039                  * have to be opportunistic
2040                  */
2041                 smp_mb();
2042                 if (waitqueue_active(&rl->wait[READ]))
2043                         wake_up(&rl->wait[READ]);
2044                 if (waitqueue_active(&rl->wait[WRITE]))
2045                         wake_up(&rl->wait[WRITE]);
2046         }
2047
2048         blk_remove_plug(q);
2049         q->request_fn(q);
2050         spin_unlock_irqrestore(q->queue_lock, flags);
2051 }
2052
2053 /*
2054  * Timer running if the active_queue is currently idling inside its time slice
2055  */
2056 static void cfq_idle_slice_timer(unsigned long data)
2057 {
2058         struct cfq_data *cfqd = (struct cfq_data *) data;
2059         struct cfq_queue *cfqq;
2060         unsigned long flags;
2061
2062         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2063
2064         if ((cfqq = cfqd->active_queue) != NULL) {
2065                 unsigned long now = jiffies;
2066
2067                 /*
2068                  * expired
2069                  */
2070                 if (time_after(now, cfqq->slice_end))
2071                         goto expire;
2072
2073                 /*
2074                  * only expire and reinvoke the request handler if there are
2075                  * other queues with pending requests
2076                  */
2077                 if (!cfqd->busy_queues) {
2078                         cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2079                         add_timer(&cfqd->idle_slice_timer);
2080                         goto out_cont;
2081                 }
2082
2083                 /*
2084                  * not expired and it has a request pending, let it dispatch
2085                  */
2086                 if (!RB_EMPTY(&cfqq->sort_list)) {
2087                         cfq_mark_cfqq_must_dispatch(cfqq);
2088                         goto out_kick;
2089                 }
2090         }
2091 expire:
2092         cfq_slice_expired(cfqd, 0);
2093 out_kick:
2094         cfq_schedule_dispatch(cfqd);
2095 out_cont:
2096         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2097 }
2098
2099 /*
2100  * Timer running if an idle class queue is waiting for service
2101  */
2102 static void cfq_idle_class_timer(unsigned long data)
2103 {
2104         struct cfq_data *cfqd = (struct cfq_data *) data;
2105         unsigned long flags, end;
2106
2107         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2108
2109         /*
2110          * race with a non-idle queue, reset timer
2111          */
2112         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2113         if (!time_after_eq(jiffies, end)) {
2114                 cfqd->idle_class_timer.expires = end;
2115                 add_timer(&cfqd->idle_class_timer);
2116         } else
2117                 cfq_schedule_dispatch(cfqd);
2118
2119         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2120 }
2121
2122 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2123 {
2124         del_timer_sync(&cfqd->idle_slice_timer);
2125         del_timer_sync(&cfqd->idle_class_timer);
2126         blk_sync_queue(cfqd->queue);
2127 }
2128
2129 static void cfq_put_cfqd(struct cfq_data *cfqd)
2130 {
2131         if (!atomic_dec_and_test(&cfqd->ref))
2132                 return;
2133
2134         cfq_shutdown_timer_wq(cfqd);
2135
2136         mempool_destroy(cfqd->crq_pool);
2137         kfree(cfqd->crq_hash);
2138         kfree(cfqd->cfq_hash);
2139         kfree(cfqd);
2140 }
2141
2142 static void cfq_exit_queue(elevator_t *e)
2143 {
2144         struct cfq_data *cfqd = e->elevator_data;
2145         request_queue_t *q = cfqd->queue;
2146
2147         cfq_shutdown_timer_wq(cfqd);
2148         write_lock(&cfq_exit_lock);
2149         spin_lock_irq(q->queue_lock);
2150         if (cfqd->active_queue)
2151                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2152         while (!list_empty(&cfqd->cic_list)) {
2153                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2154                                                         struct cfq_io_context,
2155                                                         queue_list);
2156                 if (cic->cfqq[ASYNC]) {
2157                         cfq_put_queue(cic->cfqq[ASYNC]);
2158                         cic->cfqq[ASYNC] = NULL;
2159                 }
2160                 if (cic->cfqq[SYNC]) {
2161                         cfq_put_queue(cic->cfqq[SYNC]);
2162                         cic->cfqq[SYNC] = NULL;
2163                 }
2164                 cic->key = NULL;
2165                 list_del_init(&cic->queue_list);
2166         }
2167         spin_unlock_irq(q->queue_lock);
2168         write_unlock(&cfq_exit_lock);
2169         cfq_put_cfqd(cfqd);
2170 }
2171
2172 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2173 {
2174         struct cfq_data *cfqd;
2175         int i;
2176
2177         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2178         if (!cfqd)
2179                 return -ENOMEM;
2180
2181         memset(cfqd, 0, sizeof(*cfqd));
2182
2183         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2184                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2185
2186         INIT_LIST_HEAD(&cfqd->busy_rr);
2187         INIT_LIST_HEAD(&cfqd->cur_rr);
2188         INIT_LIST_HEAD(&cfqd->idle_rr);
2189         INIT_LIST_HEAD(&cfqd->empty_list);
2190         INIT_LIST_HEAD(&cfqd->cic_list);
2191
2192         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2193         if (!cfqd->crq_hash)
2194                 goto out_crqhash;
2195
2196         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2197         if (!cfqd->cfq_hash)
2198                 goto out_cfqhash;
2199
2200         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
2201         if (!cfqd->crq_pool)
2202                 goto out_crqpool;
2203
2204         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2205                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2206         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2207                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2208
2209         e->elevator_data = cfqd;
2210
2211         cfqd->queue = q;
2212
2213         cfqd->max_queued = q->nr_requests / 4;
2214         q->nr_batching = cfq_queued;
2215
2216         init_timer(&cfqd->idle_slice_timer);
2217         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2218         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2219
2220         init_timer(&cfqd->idle_class_timer);
2221         cfqd->idle_class_timer.function = cfq_idle_class_timer;
2222         cfqd->idle_class_timer.data = (unsigned long) cfqd;
2223
2224         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2225
2226         atomic_set(&cfqd->ref, 1);
2227
2228         cfqd->cfq_queued = cfq_queued;
2229         cfqd->cfq_quantum = cfq_quantum;
2230         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2231         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2232         cfqd->cfq_back_max = cfq_back_max;
2233         cfqd->cfq_back_penalty = cfq_back_penalty;
2234         cfqd->cfq_slice[0] = cfq_slice_async;
2235         cfqd->cfq_slice[1] = cfq_slice_sync;
2236         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2237         cfqd->cfq_slice_idle = cfq_slice_idle;
2238         cfqd->cfq_max_depth = cfq_max_depth;
2239
2240         return 0;
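/*
 * error unwinding: each label below frees what was allocated before the
 * failing step, falling through so cleanup happens in reverse order.
 */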
2241 out_crqpool:
2242         kfree(cfqd->cfq_hash);
2243 out_cfqhash:
2244         kfree(cfqd->crq_hash);
2245 out_crqhash:
2246         kfree(cfqd);
2247         return -ENOMEM;
2248 }
2249
2250 static void cfq_slab_kill(void)
2251 {
2252         if (crq_pool)
2253                 kmem_cache_destroy(crq_pool);
2254         if (cfq_pool)
2255                 kmem_cache_destroy(cfq_pool);
2256         if (cfq_ioc_pool)
2257                 kmem_cache_destroy(cfq_ioc_pool);
2258 }
2259
2260 static int __init cfq_slab_setup(void)
2261 {
2262         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2263                                         NULL, NULL);
2264         if (!crq_pool)
2265                 goto fail;
2266
2267         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2268                                         NULL, NULL);
2269         if (!cfq_pool)
2270                 goto fail;
2271
2272         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2273                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2274         if (!cfq_ioc_pool)
2275                 goto fail;
2276
2277         return 0;
2278 fail:
2279         cfq_slab_kill();
2280         return -ENOMEM;
2281 }
2282
2283 /*
2284  * sysfs parts below -->
2285  */
2286 struct cfq_fs_entry {
2287         struct attribute attr;
2288         ssize_t (*show)(struct cfq_data *, char *);
2289         ssize_t (*store)(struct cfq_data *, const char *, size_t);
2290 };
2291
2292 static ssize_t
2293 cfq_var_show(unsigned int var, char *page)
2294 {
2295         return sprintf(page, "%d\n", var);
2296 }
2297
2298 static ssize_t
2299 cfq_var_store(unsigned int *var, const char *page, size_t count)
2300 {
2301         char *p = (char *) page;
2302
2303         *var = simple_strtoul(p, &p, 10);
2304         return count;
2305 }
2306
2307 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2308 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
2309 {                                                                       \
2310         unsigned int __data = __VAR;                                    \
2311         if (__CONV)                                                     \
2312                 __data = jiffies_to_msecs(__data);                      \
2313         return cfq_var_show(__data, (page));                            \
2314 }
2315 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2316 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2317 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2318 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2319 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
2320 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
2321 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2322 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2323 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2324 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2325 SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2326 #undef SHOW_FUNCTION
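/*
 * as an example, SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1)
 * above expands to:
 *
 *        static ssize_t cfq_slice_idle_show(struct cfq_data *cfqd, char *page)
 *        {
 *                unsigned int __data = cfqd->cfq_slice_idle;
 *                if (1)
 *                        __data = jiffies_to_msecs(__data);
 *                return cfq_var_show(__data, (page));
 *        }
 *
 * i.e. time based tunables are reported in milliseconds, the others as-is.
 */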
2327
2328 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2329 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
2330 {                                                                       \
2331         unsigned int __data;                                            \
2332         int ret = cfq_var_store(&__data, (page), count);                \
2333         if (__data < (MIN))                                             \
2334                 __data = (MIN);                                         \
2335         else if (__data > (MAX))                                        \
2336                 __data = (MAX);                                         \
2337         if (__CONV)                                                     \
2338                 *(__PTR) = msecs_to_jiffies(__data);                    \
2339         else                                                            \
2340                 *(__PTR) = __data;                                      \
2341         return ret;                                                     \
2342 }
2343 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2344 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2345 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2346 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2347 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2348 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2349 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2350 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2351 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2352 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2353 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2354 #undef STORE_FUNCTION
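/*
 * the store variants clamp the written value into [MIN, MAX] and, where
 * __CONV is set, convert milliseconds back to jiffies.  these attributes
 * are what the elevator exposes through sysfs (typically under
 * /sys/block/<dev>/queue/iosched/ on this kernel), using the names given
 * in the cfq_fs_entry definitions below.
 */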
2355
2356 static struct cfq_fs_entry cfq_quantum_entry = {
2357         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
2358         .show = cfq_quantum_show,
2359         .store = cfq_quantum_store,
2360 };
2361 static struct cfq_fs_entry cfq_queued_entry = {
2362         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
2363         .show = cfq_queued_show,
2364         .store = cfq_queued_store,
2365 };
2366 static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
2367         .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
2368         .show = cfq_fifo_expire_sync_show,
2369         .store = cfq_fifo_expire_sync_store,
2370 };
2371 static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
2372         .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
2373         .show = cfq_fifo_expire_async_show,
2374         .store = cfq_fifo_expire_async_store,
2375 };
2376 static struct cfq_fs_entry cfq_back_max_entry = {
2377         .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
2378         .show = cfq_back_max_show,
2379         .store = cfq_back_max_store,
2380 };
2381 static struct cfq_fs_entry cfq_back_penalty_entry = {
2382         .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
2383         .show = cfq_back_penalty_show,
2384         .store = cfq_back_penalty_store,
2385 };
2386 static struct cfq_fs_entry cfq_slice_sync_entry = {
2387         .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
2388         .show = cfq_slice_sync_show,
2389         .store = cfq_slice_sync_store,
2390 };
2391 static struct cfq_fs_entry cfq_slice_async_entry = {
2392         .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
2393         .show = cfq_slice_async_show,
2394         .store = cfq_slice_async_store,
2395 };
2396 static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2397         .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2398         .show = cfq_slice_async_rq_show,
2399         .store = cfq_slice_async_rq_store,
2400 };
2401 static struct cfq_fs_entry cfq_slice_idle_entry = {
2402         .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2403         .show = cfq_slice_idle_show,
2404         .store = cfq_slice_idle_store,
2405 };
2406 static struct cfq_fs_entry cfq_max_depth_entry = {
2407         .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2408         .show = cfq_max_depth_show,
2409         .store = cfq_max_depth_store,
2410 };
2411
2412 static struct attribute *default_attrs[] = {
2413         &cfq_quantum_entry.attr,
2414         &cfq_queued_entry.attr,
2415         &cfq_fifo_expire_sync_entry.attr,
2416         &cfq_fifo_expire_async_entry.attr,
2417         &cfq_back_max_entry.attr,
2418         &cfq_back_penalty_entry.attr,
2419         &cfq_slice_sync_entry.attr,
2420         &cfq_slice_async_entry.attr,
2421         &cfq_slice_async_rq_entry.attr,
2422         &cfq_slice_idle_entry.attr,
2423         &cfq_max_depth_entry.attr,
2424         NULL,
2425 };
2426
2427 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
2428
2429 static ssize_t
2430 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2431 {
2432         elevator_t *e = container_of(kobj, elevator_t, kobj);
2433         struct cfq_fs_entry *entry = to_cfq(attr);
2434
2435         if (!entry->show)
2436                 return -EIO;
2437
2438         return entry->show(e->elevator_data, page);
2439 }
2440
2441 static ssize_t
2442 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
2443                const char *page, size_t length)
2444 {
2445         elevator_t *e = container_of(kobj, elevator_t, kobj);
2446         struct cfq_fs_entry *entry = to_cfq(attr);
2447
2448         if (!entry->store)
2449                 return -EIO;
2450
2451         return entry->store(e->elevator_data, page, length);
2452 }
2453
2454 static struct sysfs_ops cfq_sysfs_ops = {
2455         .show   = cfq_attr_show,
2456         .store  = cfq_attr_store,
2457 };
2458
2459 static struct kobj_type cfq_ktype = {
2460         .sysfs_ops      = &cfq_sysfs_ops,
2461         .default_attrs  = default_attrs,
2462 };
2463
2464 static struct elevator_type iosched_cfq = {
2465         .ops = {
2466                 .elevator_merge_fn =            cfq_merge,
2467                 .elevator_merged_fn =           cfq_merged_request,
2468                 .elevator_merge_req_fn =        cfq_merged_requests,
2469                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2470                 .elevator_add_req_fn =          cfq_insert_request,
2471                 .elevator_activate_req_fn =     cfq_activate_request,
2472                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2473                 .elevator_queue_empty_fn =      cfq_queue_empty,
2474                 .elevator_completed_req_fn =    cfq_completed_request,
2475                 .elevator_former_req_fn =       cfq_former_request,
2476                 .elevator_latter_req_fn =       cfq_latter_request,
2477                 .elevator_set_req_fn =          cfq_set_request,
2478                 .elevator_put_req_fn =          cfq_put_request,
2479                 .elevator_may_queue_fn =        cfq_may_queue,
2480                 .elevator_init_fn =             cfq_init_queue,
2481                 .elevator_exit_fn =             cfq_exit_queue,
2482                 .trim =                         cfq_trim,
2483         },
2484         .elevator_ktype =       &cfq_ktype,
2485         .elevator_name =        "cfq",
2486         .elevator_owner =       THIS_MODULE,
2487 };
2488
2489 static int __init cfq_init(void)
2490 {
2491         int ret;
2492
2493         /*
2494          * could round down to 0 on low HZ setups (HZ / 25 and HZ / 100)
2495          */
2496         if (!cfq_slice_async)
2497                 cfq_slice_async = 1;
2498         if (!cfq_slice_idle)
2499                 cfq_slice_idle = 1;
2500
2501         if (cfq_slab_setup())
2502                 return -ENOMEM;
2503
2504         ret = elv_register(&iosched_cfq);
2505         if (ret)
2506                 cfq_slab_kill();
2507
2508         return ret;
2509 }
2510
2511 static void __exit cfq_exit(void)
2512 {
2513         elv_unregister(&iosched_cfq);
2514         cfq_slab_kill();
2515 }
2516
2517 module_init(cfq_init);
2518 module_exit(cfq_exit);
2519
2520 MODULE_AUTHOR("Jens Axboe");
2521 MODULE_LICENSE("GPL");
2522 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");