[PATCH] cfq-iosched: Kill O(N) runtime of cfq_resort_rr_list()
[safe/jmp/linux-2.6] / block / cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8  */
9 #include <linux/module.h>
10 #include <linux/blkdev.h>
11 #include <linux/elevator.h>
12 #include <linux/hash.h>
13 #include <linux/rbtree.h>
14 #include <linux/ioprio.h>
15
/*
 * tunables
 *
 * Compile-time defaults. struct cfq_data carries per-device fields with the
 * same names; the code in this file reads those fields, not these statics.
 * NOTE(review): the copy from these defaults into cfq_data happens outside
 * this part of the file - confirm in the init path.
 */
static const int cfq_quantum = 4;		/* max queue in one round of service */
/* two expiry values, both fractions of HZ (i.e. jiffies) */
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */

/* slice lengths, expressed as fractions of HZ */
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
28
/* added to last_end_request before idle class queues may be serviced */
#define CFQ_IDLE_GRACE		(HZ / 10)
/* divisor cfq_prio_to_slice() uses to derive the per-priority slice step */
#define CFQ_SLICE_SCALE		(5)

/* hash key returned by cfq_queue_pid() for everything that is not sync io */
#define CFQ_KEY_ASYNC		(0)

/*
 * for the hash of cfqq inside the cfqd
 */
#define CFQ_QHASH_SHIFT		6
#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)

/* map a cfq_list link back to the cfq_queue embedding it */
#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)

/* scheduler private data hung off a request: its io context and its cfqq */
#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
#define RQ_CFQQ(rq)		((rq)->elevator_private2)

/*
 * slab caches
 * NOTE(review): presumably for cfq_queue and cfq_io_context objects; the
 * allocation sites are outside this part of the file.
 */
static kmem_cache_t *cfq_pool;
static kmem_cache_t *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, ioc_count);
static struct completion *ioc_gone;

/* one rr list per best-effort priority level */
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

/* indices into the two-element per-direction arrays (on_dispatch[] etc.) */
#define ASYNC			(0)
#define SYNC			(1)

/* total number of requests of this queue sitting on the dispatch list */
#define cfq_cfqq_dispatched(cfqq)	\
	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])

/* a queue is of the sync class when it is keyed by anything but CFQ_KEY_ASYNC */
#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)

/* sync class, or currently has sync requests on the dispatch list */
#define cfq_cfqq_sync(cfqq)		\
	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])

/* more than 80 samples are required before sample-derived values are used */
#define sample_valid(samples)	((samples) > 80)
68
69 /*
70  * Per block device queue structure
71  */
72 struct cfq_data {
73         request_queue_t *queue;
74
75         /*
76          * rr list of queues with requests and the count of them
77          */
78         struct list_head rr_list[CFQ_PRIO_LISTS];
79         struct list_head busy_rr;
80         struct list_head cur_rr;
81         struct list_head idle_rr;
82         unsigned int busy_queues;
83
84         /*
85          * non-ordered list of empty cfqq's
86          */
87         struct list_head empty_list;
88
89         /*
90          * cfqq lookup hash
91          */
92         struct hlist_head *cfq_hash;
93
94         int rq_in_driver;
95         int hw_tag;
96
97         /*
98          * idle window management
99          */
100         struct timer_list idle_slice_timer;
101         struct work_struct unplug_work;
102
103         struct cfq_queue *active_queue;
104         struct cfq_io_context *active_cic;
105         int cur_prio, cur_end_prio;
106         unsigned int dispatch_slice;
107
108         struct timer_list idle_class_timer;
109
110         sector_t last_sector;
111         unsigned long last_end_request;
112
113         /*
114          * tunables, see top of file
115          */
116         unsigned int cfq_quantum;
117         unsigned int cfq_fifo_expire[2];
118         unsigned int cfq_back_penalty;
119         unsigned int cfq_back_max;
120         unsigned int cfq_slice[2];
121         unsigned int cfq_slice_async_rq;
122         unsigned int cfq_slice_idle;
123
124         struct list_head cic_list;
125 };
126
127 /*
128  * Per process-grouping structure
129  */
130 struct cfq_queue {
131         /* reference count */
132         atomic_t ref;
133         /* parent cfq_data */
134         struct cfq_data *cfqd;
135         /* cfqq lookup hash */
136         struct hlist_node cfq_hash;
137         /* hash key */
138         unsigned int key;
139         /* on either rr or empty list of cfqd */
140         struct list_head cfq_list;
141         /* sorted list of pending requests */
142         struct rb_root sort_list;
143         /* if fifo isn't expired, next request to serve */
144         struct request *next_rq;
145         /* requests queued in sort_list */
146         int queued[2];
147         /* currently allocated requests */
148         int allocated[2];
149         /* fifo list of requests in sort_list */
150         struct list_head fifo;
151
152         unsigned long slice_start;
153         unsigned long slice_end;
154         unsigned long slice_left;
155
156         /* number of requests that are on the dispatch list */
157         int on_dispatch[2];
158
159         /* io prio of this group */
160         unsigned short ioprio, org_ioprio;
161         unsigned short ioprio_class, org_ioprio_class;
162
163         /* various state flags, see below */
164         unsigned int flags;
165 };
166
/*
 * Bit numbers inside cfq_queue->flags. The accessors generated by
 * CFQ_CFQQ_FNS() shift 1 by these values.
 */
enum cfqq_state_flags {
	/* set by cfq_add_cfqq_rr(), cleared by cfq_del_cfqq_rr() */
	CFQ_CFQQ_FLAG_on_rr = 0,
	/* set by cfq_arm_slice_timer() when the idle timer gets armed */
	CFQ_CFQQ_FLAG_wait_request,
	/* NOTE(review): users are outside this part of the file */
	CFQ_CFQQ_FLAG_must_alloc,
	/* cleared when the queue becomes the active queue */
	CFQ_CFQQ_FLAG_must_alloc_slice,
	/* while set, cfq_select_queue() does not expire the slice on time */
	CFQ_CFQQ_FLAG_must_dispatch,
	/* set by cfq_check_fifo() once it handed out an expired request */
	CFQ_CFQQ_FLAG_fifo_expire,
	/* required for idling; without it the slice expires after dispatch */
	CFQ_CFQQ_FLAG_idle_window,
	/* NOTE(review): users are outside this part of the file */
	CFQ_CFQQ_FLAG_prio_changed,
	/* cleared at slice expiry; new queues are sorted to the list front */
	CFQ_CFQQ_FLAG_queue_new,
};
178
/*
 * Generate three helpers per state flag:
 *   cfq_mark_cfqq_<name>()  - set the flag bit
 *   cfq_clear_cfqq_<name>() - clear the flag bit
 *   cfq_cfqq_<name>()       - return 1 if the flag bit is set, else 0
 */
#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

/* one instantiation per member of enum cfqq_state_flags */
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(queue_new);
#undef CFQ_CFQQ_FNS
203
/* forward declarations for functions used before their definition */
static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
static void cfq_dispatch_insert(request_queue_t *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
207
208 /*
209  * scheduler run of queue, if there are requests pending and no one in the
210  * driver that will restart queueing
211  */
212 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
213 {
214         if (cfqd->busy_queues)
215                 kblockd_schedule_work(&cfqd->unplug_work);
216 }
217
218 static int cfq_queue_empty(request_queue_t *q)
219 {
220         struct cfq_data *cfqd = q->elevator->elevator_data;
221
222         return !cfqd->busy_queues;
223 }
224
225 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
226 {
227         if (rw == READ || rw == WRITE_SYNC)
228                 return task->pid;
229
230         return CFQ_KEY_ASYNC;
231 }
232
233 /*
234  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
235  * We choose the request that is closest to the head right now. Distance
236  * behind the head is penalized and only allowed to a certain extent.
237  */
238 static struct request *
239 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
240 {
241         sector_t last, s1, s2, d1 = 0, d2 = 0;
242         unsigned long back_max;
243 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
244 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
245         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
246
247         if (rq1 == NULL || rq1 == rq2)
248                 return rq2;
249         if (rq2 == NULL)
250                 return rq1;
251
252         if (rq_is_sync(rq1) && !rq_is_sync(rq2))
253                 return rq1;
254         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
255                 return rq2;
256
257         s1 = rq1->sector;
258         s2 = rq2->sector;
259
260         last = cfqd->last_sector;
261
262         /*
263          * by definition, 1KiB is 2 sectors
264          */
265         back_max = cfqd->cfq_back_max * 2;
266
267         /*
268          * Strict one way elevator _except_ in the case where we allow
269          * short backward seeks which are biased as twice the cost of a
270          * similar forward seek.
271          */
272         if (s1 >= last)
273                 d1 = s1 - last;
274         else if (s1 + back_max >= last)
275                 d1 = (last - s1) * cfqd->cfq_back_penalty;
276         else
277                 wrap |= CFQ_RQ1_WRAP;
278
279         if (s2 >= last)
280                 d2 = s2 - last;
281         else if (s2 + back_max >= last)
282                 d2 = (last - s2) * cfqd->cfq_back_penalty;
283         else
284                 wrap |= CFQ_RQ2_WRAP;
285
286         /* Found required data */
287
288         /*
289          * By doing switch() on the bit mask "wrap" we avoid having to
290          * check two variables for all permutations: --> faster!
291          */
292         switch (wrap) {
293         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
294                 if (d1 < d2)
295                         return rq1;
296                 else if (d2 < d1)
297                         return rq2;
298                 else {
299                         if (s1 >= s2)
300                                 return rq1;
301                         else
302                                 return rq2;
303                 }
304
305         case CFQ_RQ2_WRAP:
306                 return rq1;
307         case CFQ_RQ1_WRAP:
308                 return rq2;
309         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
310         default:
311                 /*
312                  * Since both rqs are wrapped,
313                  * start with the one that's further behind head
314                  * (--> only *one* back seek required),
315                  * since back seek takes more time than forward.
316                  */
317                 if (s1 <= s2)
318                         return rq1;
319                 else
320                         return rq2;
321         }
322 }
323
324 /*
325  * would be nice to take fifo expire time into account as well
326  */
327 static struct request *
328 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
329                   struct request *last)
330 {
331         struct rb_node *rbnext = rb_next(&last->rb_node);
332         struct rb_node *rbprev = rb_prev(&last->rb_node);
333         struct request *next = NULL, *prev = NULL;
334
335         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
336
337         if (rbprev)
338                 prev = rb_entry_rq(rbprev);
339
340         if (rbnext)
341                 next = rb_entry_rq(rbnext);
342         else {
343                 rbnext = rb_first(&cfqq->sort_list);
344                 if (rbnext && rbnext != &last->rb_node)
345                         next = rb_entry_rq(rbnext);
346         }
347
348         return cfq_choose_req(cfqd, next, prev);
349 }
350
351 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
352 {
353         struct cfq_data *cfqd = cfqq->cfqd;
354         struct list_head *list;
355
356         BUG_ON(!cfq_cfqq_on_rr(cfqq));
357
358         list_del(&cfqq->cfq_list);
359
360         if (cfq_class_rt(cfqq))
361                 list = &cfqd->cur_rr;
362         else if (cfq_class_idle(cfqq))
363                 list = &cfqd->idle_rr;
364         else {
365                 /*
366                  * if cfqq has requests in flight, don't allow it to be
367                  * found in cfq_set_active_queue before it has finished them.
368                  * this is done to increase fairness between a process that
369                  * has lots of io pending vs one that only generates one
370                  * sporadically or synchronously
371                  */
372                 if (cfq_cfqq_dispatched(cfqq))
373                         list = &cfqd->busy_rr;
374                 else
375                         list = &cfqd->rr_list[cfqq->ioprio];
376         }
377
378         /*
379          * If this queue was preempted or is new (never been serviced), let
380          * it be added first for fairness but beind other new queues.
381          * Otherwise, just add to the back  of the list.
382          */
383         if (preempted || cfq_cfqq_queue_new(cfqq)) {
384                 struct list_head *n = list;
385                 struct cfq_queue *__cfqq;
386
387                 while (n->next != list) {
388                         __cfqq = list_entry_cfqq(n->next);
389                         if (!cfq_cfqq_queue_new(__cfqq))
390                                 break;
391
392                         n = n->next;
393                 }
394
395                 list = n;
396         }
397
398         list_add_tail(&cfqq->cfq_list, list);
399 }
400
401 /*
402  * add to busy list of queues for service, trying to be fair in ordering
403  * the pending list according to last request service
404  */
405 static inline void
406 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
407 {
408         BUG_ON(cfq_cfqq_on_rr(cfqq));
409         cfq_mark_cfqq_on_rr(cfqq);
410         cfqd->busy_queues++;
411
412         cfq_resort_rr_list(cfqq, 0);
413 }
414
415 static inline void
416 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
417 {
418         BUG_ON(!cfq_cfqq_on_rr(cfqq));
419         cfq_clear_cfqq_on_rr(cfqq);
420         list_move(&cfqq->cfq_list, &cfqd->empty_list);
421
422         BUG_ON(!cfqd->busy_queues);
423         cfqd->busy_queues--;
424 }
425
426 /*
427  * rb tree support functions
428  */
429 static inline void cfq_del_rq_rb(struct request *rq)
430 {
431         struct cfq_queue *cfqq = RQ_CFQQ(rq);
432         struct cfq_data *cfqd = cfqq->cfqd;
433         const int sync = rq_is_sync(rq);
434
435         BUG_ON(!cfqq->queued[sync]);
436         cfqq->queued[sync]--;
437
438         elv_rb_del(&cfqq->sort_list, rq);
439
440         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
441                 cfq_del_cfqq_rr(cfqd, cfqq);
442 }
443
444 static void cfq_add_rq_rb(struct request *rq)
445 {
446         struct cfq_queue *cfqq = RQ_CFQQ(rq);
447         struct cfq_data *cfqd = cfqq->cfqd;
448         struct request *__alias;
449
450         cfqq->queued[rq_is_sync(rq)]++;
451
452         /*
453          * looks a little odd, but the first insert might return an alias.
454          * if that happens, put the alias on the dispatch list
455          */
456         while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
457                 cfq_dispatch_insert(cfqd->queue, __alias);
458 }
459
460 static inline void
461 cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
462 {
463         elv_rb_del(&cfqq->sort_list, rq);
464         cfqq->queued[rq_is_sync(rq)]--;
465         cfq_add_rq_rb(rq);
466 }
467
468 static struct request *
469 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
470 {
471         struct task_struct *tsk = current;
472         pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
473         struct cfq_queue *cfqq;
474
475         cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
476         if (cfqq) {
477                 sector_t sector = bio->bi_sector + bio_sectors(bio);
478
479                 return elv_rb_find(&cfqq->sort_list, sector);
480         }
481
482         return NULL;
483 }
484
485 static void cfq_activate_request(request_queue_t *q, struct request *rq)
486 {
487         struct cfq_data *cfqd = q->elevator->elevator_data;
488
489         cfqd->rq_in_driver++;
490
491         /*
492          * If the depth is larger 1, it really could be queueing. But lets
493          * make the mark a little higher - idling could still be good for
494          * low queueing, and a low queueing number could also just indicate
495          * a SCSI mid layer like behaviour where limit+1 is often seen.
496          */
497         if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
498                 cfqd->hw_tag = 1;
499 }
500
501 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
502 {
503         struct cfq_data *cfqd = q->elevator->elevator_data;
504
505         WARN_ON(!cfqd->rq_in_driver);
506         cfqd->rq_in_driver--;
507 }
508
509 static void cfq_remove_request(struct request *rq)
510 {
511         struct cfq_queue *cfqq = RQ_CFQQ(rq);
512
513         if (cfqq->next_rq == rq)
514                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
515
516         list_del_init(&rq->queuelist);
517         cfq_del_rq_rb(rq);
518 }
519
520 static int
521 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
522 {
523         struct cfq_data *cfqd = q->elevator->elevator_data;
524         struct request *__rq;
525
526         __rq = cfq_find_rq_fmerge(cfqd, bio);
527         if (__rq && elv_rq_merge_ok(__rq, bio)) {
528                 *req = __rq;
529                 return ELEVATOR_FRONT_MERGE;
530         }
531
532         return ELEVATOR_NO_MERGE;
533 }
534
535 static void cfq_merged_request(request_queue_t *q, struct request *req,
536                                int type)
537 {
538         if (type == ELEVATOR_FRONT_MERGE) {
539                 struct cfq_queue *cfqq = RQ_CFQQ(req);
540
541                 cfq_reposition_rq_rb(cfqq, req);
542         }
543 }
544
545 static void
546 cfq_merged_requests(request_queue_t *q, struct request *rq,
547                     struct request *next)
548 {
549         /*
550          * reposition in fifo if next is older than rq
551          */
552         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
553             time_before(next->start_time, rq->start_time))
554                 list_move(&rq->queuelist, &next->queuelist);
555
556         cfq_remove_request(next);
557 }
558
559 static inline void
560 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
561 {
562         if (cfqq) {
563                 /*
564                  * stop potential idle class queues waiting service
565                  */
566                 del_timer(&cfqd->idle_class_timer);
567
568                 cfqq->slice_start = jiffies;
569                 cfqq->slice_end = 0;
570                 cfqq->slice_left = 0;
571                 cfq_clear_cfqq_must_alloc_slice(cfqq);
572                 cfq_clear_cfqq_fifo_expire(cfqq);
573         }
574
575         cfqd->active_queue = cfqq;
576 }
577
578 /*
579  * current cfqq expired its slice (or was too idle), select new one
580  */
581 static void
582 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
583                     int preempted)
584 {
585         unsigned long now = jiffies;
586
587         if (cfq_cfqq_wait_request(cfqq))
588                 del_timer(&cfqd->idle_slice_timer);
589
590         if (!preempted && !cfq_cfqq_dispatched(cfqq))
591                 cfq_schedule_dispatch(cfqd);
592
593         cfq_clear_cfqq_must_dispatch(cfqq);
594         cfq_clear_cfqq_wait_request(cfqq);
595         cfq_clear_cfqq_queue_new(cfqq);
596
597         /*
598          * store what was left of this slice, if the queue idled out
599          * or was preempted
600          */
601         if (time_after(cfqq->slice_end, now))
602                 cfqq->slice_left = cfqq->slice_end - now;
603         else
604                 cfqq->slice_left = 0;
605
606         if (cfq_cfqq_on_rr(cfqq))
607                 cfq_resort_rr_list(cfqq, preempted);
608
609         if (cfqq == cfqd->active_queue)
610                 cfqd->active_queue = NULL;
611
612         if (cfqd->active_cic) {
613                 put_io_context(cfqd->active_cic->ioc);
614                 cfqd->active_cic = NULL;
615         }
616
617         cfqd->dispatch_slice = 0;
618 }
619
620 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
621 {
622         struct cfq_queue *cfqq = cfqd->active_queue;
623
624         if (cfqq)
625                 __cfq_slice_expired(cfqd, cfqq, preempted);
626 }
627
628 /*
629  * 0
630  * 0,1
631  * 0,1,2
632  * 0,1,2,3
633  * 0,1,2,3,4
634  * 0,1,2,3,4,5
635  * 0,1,2,3,4,5,6
636  * 0,1,2,3,4,5,6,7
637  */
638 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
639 {
640         int prio, wrap;
641
642         prio = -1;
643         wrap = 0;
644         do {
645                 int p;
646
647                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
648                         if (!list_empty(&cfqd->rr_list[p])) {
649                                 prio = p;
650                                 break;
651                         }
652                 }
653
654                 if (prio != -1)
655                         break;
656                 cfqd->cur_prio = 0;
657                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
658                         cfqd->cur_end_prio = 0;
659                         if (wrap)
660                                 break;
661                         wrap = 1;
662                 }
663         } while (1);
664
665         if (unlikely(prio == -1))
666                 return -1;
667
668         BUG_ON(prio >= CFQ_PRIO_LISTS);
669
670         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
671
672         cfqd->cur_prio = prio + 1;
673         if (cfqd->cur_prio > cfqd->cur_end_prio) {
674                 cfqd->cur_end_prio = cfqd->cur_prio;
675                 cfqd->cur_prio = 0;
676         }
677         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
678                 cfqd->cur_prio = 0;
679                 cfqd->cur_end_prio = 0;
680         }
681
682         return prio;
683 }
684
685 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
686 {
687         struct cfq_queue *cfqq = NULL;
688
689         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
690                 /*
691                  * if current list is non-empty, grab first entry. if it is
692                  * empty, get next prio level and grab first entry then if any
693                  * are spliced
694                  */
695                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
696         } else if (!list_empty(&cfqd->busy_rr)) {
697                 /*
698                  * If no new queues are available, check if the busy list has
699                  * some before falling back to idle io.
700                  */
701                 cfqq = list_entry_cfqq(cfqd->busy_rr.next);
702         } else if (!list_empty(&cfqd->idle_rr)) {
703                 /*
704                  * if we have idle queues and no rt or be queues had pending
705                  * requests, either allow immediate service if the grace period
706                  * has passed or arm the idle grace timer
707                  */
708                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
709
710                 if (time_after_eq(jiffies, end))
711                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
712                 else
713                         mod_timer(&cfqd->idle_class_timer, end);
714         }
715
716         __cfq_set_active_queue(cfqd, cfqq);
717         return cfqq;
718 }
719
720 #define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
721
722 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
723
724 {
725         struct cfq_io_context *cic;
726         unsigned long sl;
727
728         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
729         WARN_ON(cfqq != cfqd->active_queue);
730
731         /*
732          * idle is disabled, either manually or by past process history
733          */
734         if (!cfqd->cfq_slice_idle)
735                 return 0;
736         if (!cfq_cfqq_idle_window(cfqq))
737                 return 0;
738         /*
739          * task has exited, don't wait
740          */
741         cic = cfqd->active_cic;
742         if (!cic || !cic->ioc->task)
743                 return 0;
744
745         cfq_mark_cfqq_must_dispatch(cfqq);
746         cfq_mark_cfqq_wait_request(cfqq);
747
748         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
749
750         /*
751          * we don't want to idle for seeks, but we do want to allow
752          * fair distribution of slice time for a process doing back-to-back
753          * seeks. so allow a little bit of time for him to submit a new rq
754          */
755         if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
756                 sl = min(sl, msecs_to_jiffies(2));
757
758         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
759         return 1;
760 }
761
762 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
763 {
764         struct cfq_data *cfqd = q->elevator->elevator_data;
765         struct cfq_queue *cfqq = RQ_CFQQ(rq);
766
767         cfq_remove_request(rq);
768         cfqq->on_dispatch[rq_is_sync(rq)]++;
769         elv_dispatch_sort(q, rq);
770
771         rq = list_entry(q->queue_head.prev, struct request, queuelist);
772         cfqd->last_sector = rq->sector + rq->nr_sectors;
773 }
774
775 /*
776  * return expired entry, or NULL to just start from scratch in rbtree
777  */
778 static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
779 {
780         struct cfq_data *cfqd = cfqq->cfqd;
781         struct request *rq;
782         int fifo;
783
784         if (cfq_cfqq_fifo_expire(cfqq))
785                 return NULL;
786         if (list_empty(&cfqq->fifo))
787                 return NULL;
788
789         fifo = cfq_cfqq_class_sync(cfqq);
790         rq = rq_entry_fifo(cfqq->fifo.next);
791
792         if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
793                 cfq_mark_cfqq_fifo_expire(cfqq);
794                 return rq;
795         }
796
797         return NULL;
798 }
799
800 /*
801  * Scale schedule slice based on io priority. Use the sync time slice only
802  * if a queue is marked sync and has sync io queued. A sync queue with async
803  * io only, should not get full sync slice length.
804  */
805 static inline int
806 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
807 {
808         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
809
810         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
811
812         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
813 }
814
815 static inline void
816 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
817 {
818         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
819 }
820
821 static inline int
822 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
823 {
824         const int base_rq = cfqd->cfq_slice_async_rq;
825
826         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
827
828         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
829 }
830
831 /*
832  * get next queue for service
833  */
834 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
835 {
836         unsigned long now = jiffies;
837         struct cfq_queue *cfqq;
838
839         cfqq = cfqd->active_queue;
840         if (!cfqq)
841                 goto new_queue;
842
843         /*
844          * slice has expired
845          */
846         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
847                 goto expire;
848
849         /*
850          * if queue has requests, dispatch one. if not, check if
851          * enough slice is left to wait for one
852          */
853         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
854                 goto keep_queue;
855         else if (cfq_cfqq_dispatched(cfqq)) {
856                 cfqq = NULL;
857                 goto keep_queue;
858         } else if (cfq_cfqq_class_sync(cfqq)) {
859                 if (cfq_arm_slice_timer(cfqd, cfqq))
860                         return NULL;
861         }
862
863 expire:
864         cfq_slice_expired(cfqd, 0);
865 new_queue:
866         cfqq = cfq_set_active_queue(cfqd);
867 keep_queue:
868         return cfqq;
869 }
870
871 static int
872 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
873                         int max_dispatch)
874 {
875         int dispatched = 0;
876
877         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
878
879         do {
880                 struct request *rq;
881
882                 /*
883                  * follow expired path, else get first next available
884                  */
885                 if ((rq = cfq_check_fifo(cfqq)) == NULL)
886                         rq = cfqq->next_rq;
887
888                 /*
889                  * finally, insert request into driver dispatch list
890                  */
891                 cfq_dispatch_insert(cfqd->queue, rq);
892
893                 cfqd->dispatch_slice++;
894                 dispatched++;
895
896                 if (!cfqd->active_cic) {
897                         atomic_inc(&RQ_CIC(rq)->ioc->refcount);
898                         cfqd->active_cic = RQ_CIC(rq);
899                 }
900
901                 if (RB_EMPTY_ROOT(&cfqq->sort_list))
902                         break;
903
904         } while (dispatched < max_dispatch);
905
906         /*
907          * if slice end isn't set yet, set it.
908          */
909         if (!cfqq->slice_end)
910                 cfq_set_prio_slice(cfqd, cfqq);
911
912         /*
913          * expire an async queue immediately if it has used up its slice. idle
914          * queue always expire after 1 dispatch round.
915          */
916         if ((!cfq_cfqq_sync(cfqq) &&
917             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
918             cfq_class_idle(cfqq) ||
919             !cfq_cfqq_idle_window(cfqq))
920                 cfq_slice_expired(cfqd, 0);
921
922         return dispatched;
923 }
924
925 static int
926 cfq_forced_dispatch_cfqqs(struct list_head *list)
927 {
928         struct cfq_queue *cfqq, *next;
929         int dispatched;
930
931         dispatched = 0;
932         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
933                 while (cfqq->next_rq) {
934                         cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
935                         dispatched++;
936                 }
937                 BUG_ON(!list_empty(&cfqq->fifo));
938         }
939
940         return dispatched;
941 }
942
943 static int
944 cfq_forced_dispatch(struct cfq_data *cfqd)
945 {
946         int i, dispatched = 0;
947
948         for (i = 0; i < CFQ_PRIO_LISTS; i++)
949                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
950
951         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
952         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
953         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
954
955         cfq_slice_expired(cfqd, 0);
956
957         BUG_ON(cfqd->busy_queues);
958
959         return dispatched;
960 }
961
962 static int
963 cfq_dispatch_requests(request_queue_t *q, int force)
964 {
965         struct cfq_data *cfqd = q->elevator->elevator_data;
966         struct cfq_queue *cfqq, *prev_cfqq;
967         int dispatched;
968
969         if (!cfqd->busy_queues)
970                 return 0;
971
972         if (unlikely(force))
973                 return cfq_forced_dispatch(cfqd);
974
975         dispatched = 0;
976         prev_cfqq = NULL;
977         while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
978                 int max_dispatch;
979
980                 /*
981                  * Don't repeat dispatch from the previous queue.
982                  */
983                 if (prev_cfqq == cfqq)
984                         break;
985
986                 cfq_clear_cfqq_must_dispatch(cfqq);
987                 cfq_clear_cfqq_wait_request(cfqq);
988                 del_timer(&cfqd->idle_slice_timer);
989
990                 max_dispatch = cfqd->cfq_quantum;
991                 if (cfq_class_idle(cfqq))
992                         max_dispatch = 1;
993
994                 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
995
996                 /*
997                  * If the dispatch cfqq has idling enabled and is still
998                  * the active queue, break out.
999                  */
1000                 if (cfq_cfqq_idle_window(cfqq) && cfqd->active_queue)
1001                         break;
1002
1003                 prev_cfqq = cfqq;
1004         }
1005
1006         return dispatched;
1007 }
1008
1009 /*
1010  * task holds one reference to the queue, dropped when task exits. each rq
1011  * in-flight on this queue also holds a reference, dropped when rq is freed.
1012  *
1013  * queue lock must be held here.
1014  */
1015 static void cfq_put_queue(struct cfq_queue *cfqq)
1016 {
1017         struct cfq_data *cfqd = cfqq->cfqd;
1018
1019         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1020
1021         if (!atomic_dec_and_test(&cfqq->ref))
1022                 return;
1023
1024         BUG_ON(rb_first(&cfqq->sort_list));
1025         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1026         BUG_ON(cfq_cfqq_on_rr(cfqq));
1027
1028         if (unlikely(cfqd->active_queue == cfqq))
1029                 __cfq_slice_expired(cfqd, cfqq, 0);
1030
1031         /*
1032          * it's on the empty list and still hashed
1033          */
1034         list_del(&cfqq->cfq_list);
1035         hlist_del(&cfqq->cfq_hash);
1036         kmem_cache_free(cfq_pool, cfqq);
1037 }
1038
1039 static struct cfq_queue *
1040 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1041                     const int hashval)
1042 {
1043         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1044         struct hlist_node *entry;
1045         struct cfq_queue *__cfqq;
1046
1047         hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1048                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1049
1050                 if (__cfqq->key == key && (__p == prio || !prio))
1051                         return __cfqq;
1052         }
1053
1054         return NULL;
1055 }
1056
1057 static struct cfq_queue *
1058 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1059 {
1060         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1061 }
1062
1063 static void cfq_free_io_context(struct io_context *ioc)
1064 {
1065         struct cfq_io_context *__cic;
1066         struct rb_node *n;
1067         int freed = 0;
1068
1069         while ((n = rb_first(&ioc->cic_root)) != NULL) {
1070                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1071                 rb_erase(&__cic->rb_node, &ioc->cic_root);
1072                 kmem_cache_free(cfq_ioc_pool, __cic);
1073                 freed++;
1074         }
1075
1076         elv_ioc_count_mod(ioc_count, -freed);
1077
1078         if (ioc_gone && !elv_ioc_count_read(ioc_count))
1079                 complete(ioc_gone);
1080 }
1081
1082 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1083 {
1084         if (unlikely(cfqq == cfqd->active_queue))
1085                 __cfq_slice_expired(cfqd, cfqq, 0);
1086
1087         cfq_put_queue(cfqq);
1088 }
1089
1090 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1091                                          struct cfq_io_context *cic)
1092 {
1093         list_del_init(&cic->queue_list);
1094         smp_wmb();
1095         cic->key = NULL;
1096
1097         if (cic->cfqq[ASYNC]) {
1098                 cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
1099                 cic->cfqq[ASYNC] = NULL;
1100         }
1101
1102         if (cic->cfqq[SYNC]) {
1103                 cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
1104                 cic->cfqq[SYNC] = NULL;
1105         }
1106 }
1107
1108
1109 /*
1110  * Called with interrupts disabled
1111  */
1112 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1113 {
1114         struct cfq_data *cfqd = cic->key;
1115
1116         if (cfqd) {
1117                 request_queue_t *q = cfqd->queue;
1118
1119                 spin_lock_irq(q->queue_lock);
1120                 __cfq_exit_single_io_context(cfqd, cic);
1121                 spin_unlock_irq(q->queue_lock);
1122         }
1123 }
1124
1125 static void cfq_exit_io_context(struct io_context *ioc)
1126 {
1127         struct cfq_io_context *__cic;
1128         struct rb_node *n;
1129
1130         /*
1131          * put the reference this task is holding to the various queues
1132          */
1133
1134         n = rb_first(&ioc->cic_root);
1135         while (n != NULL) {
1136                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1137
1138                 cfq_exit_single_io_context(__cic);
1139                 n = rb_next(n);
1140         }
1141 }
1142
1143 static struct cfq_io_context *
1144 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1145 {
1146         struct cfq_io_context *cic;
1147
1148         cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
1149         if (cic) {
1150                 memset(cic, 0, sizeof(*cic));
1151                 cic->last_end_request = jiffies;
1152                 INIT_LIST_HEAD(&cic->queue_list);
1153                 cic->dtor = cfq_free_io_context;
1154                 cic->exit = cfq_exit_io_context;
1155                 elv_ioc_count_inc(ioc_count);
1156         }
1157
1158         return cic;
1159 }
1160
1161 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1162 {
1163         struct task_struct *tsk = current;
1164         int ioprio_class;
1165
1166         if (!cfq_cfqq_prio_changed(cfqq))
1167                 return;
1168
1169         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1170         switch (ioprio_class) {
1171                 default:
1172                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1173                 case IOPRIO_CLASS_NONE:
1174                         /*
1175                          * no prio set, place us in the middle of the BE classes
1176                          */
1177                         cfqq->ioprio = task_nice_ioprio(tsk);
1178                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1179                         break;
1180                 case IOPRIO_CLASS_RT:
1181                         cfqq->ioprio = task_ioprio(tsk);
1182                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1183                         break;
1184                 case IOPRIO_CLASS_BE:
1185                         cfqq->ioprio = task_ioprio(tsk);
1186                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1187                         break;
1188                 case IOPRIO_CLASS_IDLE:
1189                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1190                         cfqq->ioprio = 7;
1191                         cfq_clear_cfqq_idle_window(cfqq);
1192                         break;
1193         }
1194
1195         /*
1196          * keep track of original prio settings in case we have to temporarily
1197          * elevate the priority of this queue
1198          */
1199         cfqq->org_ioprio = cfqq->ioprio;
1200         cfqq->org_ioprio_class = cfqq->ioprio_class;
1201
1202         if (cfq_cfqq_on_rr(cfqq))
1203                 cfq_resort_rr_list(cfqq, 0);
1204
1205         cfq_clear_cfqq_prio_changed(cfqq);
1206 }
1207
1208 static inline void changed_ioprio(struct cfq_io_context *cic)
1209 {
1210         struct cfq_data *cfqd = cic->key;
1211         struct cfq_queue *cfqq;
1212
1213         if (unlikely(!cfqd))
1214                 return;
1215
1216         spin_lock(cfqd->queue->queue_lock);
1217
1218         cfqq = cic->cfqq[ASYNC];
1219         if (cfqq) {
1220                 struct cfq_queue *new_cfqq;
1221                 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
1222                                          GFP_ATOMIC);
1223                 if (new_cfqq) {
1224                         cic->cfqq[ASYNC] = new_cfqq;
1225                         cfq_put_queue(cfqq);
1226                 }
1227         }
1228
1229         cfqq = cic->cfqq[SYNC];
1230         if (cfqq)
1231                 cfq_mark_cfqq_prio_changed(cfqq);
1232
1233         spin_unlock(cfqd->queue->queue_lock);
1234 }
1235
1236 static void cfq_ioc_set_ioprio(struct io_context *ioc)
1237 {
1238         struct cfq_io_context *cic;
1239         struct rb_node *n;
1240
1241         ioc->ioprio_changed = 0;
1242
1243         n = rb_first(&ioc->cic_root);
1244         while (n != NULL) {
1245                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1246
1247                 changed_ioprio(cic);
1248                 n = rb_next(n);
1249         }
1250 }
1251
1252 static struct cfq_queue *
1253 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1254               gfp_t gfp_mask)
1255 {
1256         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1257         struct cfq_queue *cfqq, *new_cfqq = NULL;
1258         unsigned short ioprio;
1259
1260 retry:
1261         ioprio = tsk->ioprio;
1262         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1263
1264         if (!cfqq) {
1265                 if (new_cfqq) {
1266                         cfqq = new_cfqq;
1267                         new_cfqq = NULL;
1268                 } else if (gfp_mask & __GFP_WAIT) {
1269                         /*
1270                          * Inform the allocator of the fact that we will
1271                          * just repeat this allocation if it fails, to allow
1272                          * the allocator to do whatever it needs to attempt to
1273                          * free memory.
1274                          */
1275                         spin_unlock_irq(cfqd->queue->queue_lock);
1276                         new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
1277                         spin_lock_irq(cfqd->queue->queue_lock);
1278                         goto retry;
1279                 } else {
1280                         cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
1281                         if (!cfqq)
1282                                 goto out;
1283                 }
1284
1285                 memset(cfqq, 0, sizeof(*cfqq));
1286
1287                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1288                 INIT_LIST_HEAD(&cfqq->cfq_list);
1289                 INIT_LIST_HEAD(&cfqq->fifo);
1290
1291                 cfqq->key = key;
1292                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1293                 atomic_set(&cfqq->ref, 0);
1294                 cfqq->cfqd = cfqd;
1295                 /*
1296                  * set ->slice_left to allow preemption for a new process
1297                  */
1298                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1299                 cfq_mark_cfqq_idle_window(cfqq);
1300                 cfq_mark_cfqq_prio_changed(cfqq);
1301                 cfq_mark_cfqq_queue_new(cfqq);
1302                 cfq_init_prio_data(cfqq);
1303         }
1304
1305         if (new_cfqq)
1306                 kmem_cache_free(cfq_pool, new_cfqq);
1307
1308         atomic_inc(&cfqq->ref);
1309 out:
1310         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1311         return cfqq;
1312 }
1313
1314 static void
1315 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1316 {
1317         WARN_ON(!list_empty(&cic->queue_list));
1318         rb_erase(&cic->rb_node, &ioc->cic_root);
1319         kmem_cache_free(cfq_ioc_pool, cic);
1320         elv_ioc_count_dec(ioc_count);
1321 }
1322
1323 static struct cfq_io_context *
1324 cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1325 {
1326         struct rb_node *n;
1327         struct cfq_io_context *cic;
1328         void *k, *key = cfqd;
1329
1330 restart:
1331         n = ioc->cic_root.rb_node;
1332         while (n) {
1333                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1334                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1335                 k = cic->key;
1336                 if (unlikely(!k)) {
1337                         cfq_drop_dead_cic(ioc, cic);
1338                         goto restart;
1339                 }
1340
1341                 if (key < k)
1342                         n = n->rb_left;
1343                 else if (key > k)
1344                         n = n->rb_right;
1345                 else
1346                         return cic;
1347         }
1348
1349         return NULL;
1350 }
1351
1352 static inline void
1353 cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1354              struct cfq_io_context *cic)
1355 {
1356         struct rb_node **p;
1357         struct rb_node *parent;
1358         struct cfq_io_context *__cic;
1359         void *k;
1360
1361         cic->ioc = ioc;
1362         cic->key = cfqd;
1363
1364 restart:
1365         parent = NULL;
1366         p = &ioc->cic_root.rb_node;
1367         while (*p) {
1368                 parent = *p;
1369                 __cic = rb_entry(parent, struct cfq_io_context, rb_node);
1370                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1371                 k = __cic->key;
1372                 if (unlikely(!k)) {
1373                         cfq_drop_dead_cic(ioc, __cic);
1374                         goto restart;
1375                 }
1376
1377                 if (cic->key < k)
1378                         p = &(*p)->rb_left;
1379                 else if (cic->key > k)
1380                         p = &(*p)->rb_right;
1381                 else
1382                         BUG();
1383         }
1384
1385         rb_link_node(&cic->rb_node, parent, p);
1386         rb_insert_color(&cic->rb_node, &ioc->cic_root);
1387
1388         spin_lock_irq(cfqd->queue->queue_lock);
1389         list_add(&cic->queue_list, &cfqd->cic_list);
1390         spin_unlock_irq(cfqd->queue->queue_lock);
1391 }
1392
1393 /*
1394  * Setup general io context and cfq io context. There can be several cfq
1395  * io contexts per general io context, if this process is doing io to more
1396  * than one device managed by cfq.
1397  */
1398 static struct cfq_io_context *
1399 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1400 {
1401         struct io_context *ioc = NULL;
1402         struct cfq_io_context *cic;
1403
1404         might_sleep_if(gfp_mask & __GFP_WAIT);
1405
1406         ioc = get_io_context(gfp_mask, cfqd->queue->node);
1407         if (!ioc)
1408                 return NULL;
1409
1410         cic = cfq_cic_rb_lookup(cfqd, ioc);
1411         if (cic)
1412                 goto out;
1413
1414         cic = cfq_alloc_io_context(cfqd, gfp_mask);
1415         if (cic == NULL)
1416                 goto err;
1417
1418         cfq_cic_link(cfqd, ioc, cic);
1419 out:
1420         smp_read_barrier_depends();
1421         if (unlikely(ioc->ioprio_changed))
1422                 cfq_ioc_set_ioprio(ioc);
1423
1424         return cic;
1425 err:
1426         put_io_context(ioc);
1427         return NULL;
1428 }
1429
1430 static void
1431 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1432 {
1433         unsigned long elapsed, ttime;
1434
1435         /*
1436          * if this context already has stuff queued, thinktime is from
1437          * last queue not last end
1438          */
1439 #if 0
1440         if (time_after(cic->last_end_request, cic->last_queue))
1441                 elapsed = jiffies - cic->last_end_request;
1442         else
1443                 elapsed = jiffies - cic->last_queue;
1444 #else
1445                 elapsed = jiffies - cic->last_end_request;
1446 #endif
1447
1448         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1449
1450         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1451         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1452         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1453 }
1454
1455 static void
1456 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1457                        struct request *rq)
1458 {
1459         sector_t sdist;
1460         u64 total;
1461
1462         if (cic->last_request_pos < rq->sector)
1463                 sdist = rq->sector - cic->last_request_pos;
1464         else
1465                 sdist = cic->last_request_pos - rq->sector;
1466
1467         /*
1468          * Don't allow the seek distance to get too large from the
1469          * odd fragment, pagein, etc
1470          */
1471         if (cic->seek_samples <= 60) /* second&third seek */
1472                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1473         else
1474                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1475
1476         cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1477         cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1478         total = cic->seek_total + (cic->seek_samples/2);
1479         do_div(total, cic->seek_samples);
1480         cic->seek_mean = (sector_t)total;
1481 }
1482
1483 /*
1484  * Disable idle window if the process thinks too long or seeks so much that
1485  * it doesn't matter
1486  */
1487 static void
1488 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1489                        struct cfq_io_context *cic)
1490 {
1491         int enable_idle = cfq_cfqq_idle_window(cfqq);
1492
1493         if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1494             (cfqd->hw_tag && CIC_SEEKY(cic)))
1495                 enable_idle = 0;
1496         else if (sample_valid(cic->ttime_samples)) {
1497                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1498                         enable_idle = 0;
1499                 else
1500                         enable_idle = 1;
1501         }
1502
1503         if (enable_idle)
1504                 cfq_mark_cfqq_idle_window(cfqq);
1505         else
1506                 cfq_clear_cfqq_idle_window(cfqq);
1507 }
1508
1509
1510 /*
1511  * Check if new_cfqq should preempt the currently active queue. Return 0 for
1512  * no or if we aren't sure, a 1 will cause a preempt.
1513  */
1514 static int
1515 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1516                    struct request *rq)
1517 {
1518         struct cfq_queue *cfqq = cfqd->active_queue;
1519
1520         if (cfq_class_idle(new_cfqq))
1521                 return 0;
1522
1523         if (!cfqq)
1524                 return 0;
1525
1526         if (cfq_class_idle(cfqq))
1527                 return 1;
1528         if (!cfq_cfqq_wait_request(new_cfqq))
1529                 return 0;
1530         /*
1531          * if it doesn't have slice left, forget it
1532          */
1533         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1534                 return 0;
1535         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1536                 return 1;
1537
1538         return 0;
1539 }
1540
1541 /*
1542  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1543  * let it have half of its nominal slice.
1544  */
1545 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1546 {
1547         struct cfq_queue *__cfqq, *next;
1548
1549         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1550                 cfq_resort_rr_list(__cfqq, 1);
1551
1552         if (!cfqq->slice_left)
1553                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1554
1555         cfqq->slice_end = cfqq->slice_left + jiffies;
1556         cfq_slice_expired(cfqd, 1);
1557         __cfq_set_active_queue(cfqd, cfqq);
1558 }
1559
1560 /*
1561  * should really be a ll_rw_blk.c helper
1562  */
1563 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1564 {
1565         request_queue_t *q = cfqd->queue;
1566
1567         if (!blk_queue_plugged(q))
1568                 q->request_fn(q);
1569         else
1570                 __generic_unplug_device(q);
1571 }
1572
1573 /*
1574  * Called when a new fs request (rq) is added (to cfqq). Check if there's
1575  * something we should do about it
1576  */
1577 static void
1578 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1579                 struct request *rq)
1580 {
1581         struct cfq_io_context *cic = RQ_CIC(rq);
1582
1583         /*
1584          * check if this request is a better next-serve candidate)) {
1585          */
1586         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
1587         BUG_ON(!cfqq->next_rq);
1588
1589         /*
1590          * we never wait for an async request and we don't allow preemption
1591          * of an async request. so just return early
1592          */
1593         if (!rq_is_sync(rq)) {
1594                 /*
1595                  * sync process issued an async request, if it's waiting
1596                  * then expire it and kick rq handling.
1597                  */
1598                 if (cic == cfqd->active_cic &&
1599                     del_timer(&cfqd->idle_slice_timer)) {
1600                         cfq_slice_expired(cfqd, 0);
1601                         cfq_start_queueing(cfqd, cfqq);
1602                 }
1603                 return;
1604         }
1605
1606         cfq_update_io_thinktime(cfqd, cic);
1607         cfq_update_io_seektime(cfqd, cic, rq);
1608         cfq_update_idle_window(cfqd, cfqq, cic);
1609
1610         cic->last_queue = jiffies;
1611         cic->last_request_pos = rq->sector + rq->nr_sectors;
1612
1613         if (cfqq == cfqd->active_queue) {
1614                 /*
1615                  * if we are waiting for a request for this queue, let it rip
1616                  * immediately and flag that we must not expire this queue
1617                  * just now
1618                  */
1619                 if (cfq_cfqq_wait_request(cfqq)) {
1620                         cfq_mark_cfqq_must_dispatch(cfqq);
1621                         del_timer(&cfqd->idle_slice_timer);
1622                         cfq_start_queueing(cfqd, cfqq);
1623                 }
1624         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1625                 /*
1626                  * not the active queue - expire current slice if it is
1627                  * idle and has expired it's mean thinktime or this new queue
1628                  * has some old slice time left and is of higher priority
1629                  */
1630                 cfq_preempt_queue(cfqd, cfqq);
1631                 cfq_mark_cfqq_must_dispatch(cfqq);
1632                 cfq_start_queueing(cfqd, cfqq);
1633         }
1634 }
1635
1636 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1637 {
1638         struct cfq_data *cfqd = q->elevator->elevator_data;
1639         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1640
1641         cfq_init_prio_data(cfqq);
1642
1643         cfq_add_rq_rb(rq);
1644
1645         if (!cfq_cfqq_on_rr(cfqq))
1646                 cfq_add_cfqq_rr(cfqd, cfqq);
1647
1648         list_add_tail(&rq->queuelist, &cfqq->fifo);
1649
1650         cfq_rq_enqueued(cfqd, cfqq, rq);
1651 }
1652
1653 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1654 {
1655         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1656         struct cfq_data *cfqd = cfqq->cfqd;
1657         const int sync = rq_is_sync(rq);
1658         unsigned long now;
1659
1660         now = jiffies;
1661
1662         WARN_ON(!cfqd->rq_in_driver);
1663         WARN_ON(!cfqq->on_dispatch[sync]);
1664         cfqd->rq_in_driver--;
1665         cfqq->on_dispatch[sync]--;
1666
1667         if (!cfq_class_idle(cfqq))
1668                 cfqd->last_end_request = now;
1669
1670         if (!cfq_cfqq_dispatched(cfqq) && cfq_cfqq_on_rr(cfqq))
1671                 cfq_resort_rr_list(cfqq, 0);
1672
1673         if (sync)
1674                 RQ_CIC(rq)->last_end_request = now;
1675
1676         /*
1677          * If this is the active queue, check if it needs to be expired,
1678          * or if we want to idle in case it has no pending requests.
1679          */
1680         if (cfqd->active_queue == cfqq) {
1681                 if (time_after(now, cfqq->slice_end))
1682                         cfq_slice_expired(cfqd, 0);
1683                 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1684                         if (!cfq_arm_slice_timer(cfqd, cfqq))
1685                                 cfq_schedule_dispatch(cfqd);
1686                 }
1687         }
1688 }
1689
1690 /*
1691  * we temporarily boost lower priority queues if they are holding fs exclusive
1692  * resources. they are boosted to normal prio (CLASS_BE/4)
1693  */
1694 static void cfq_prio_boost(struct cfq_queue *cfqq)
1695 {
1696         const int ioprio_class = cfqq->ioprio_class;
1697         const int ioprio = cfqq->ioprio;
1698
1699         if (has_fs_excl()) {
1700                 /*
1701                  * boost idle prio on transactions that would lock out other
1702                  * users of the filesystem
1703                  */
1704                 if (cfq_class_idle(cfqq))
1705                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1706                 if (cfqq->ioprio > IOPRIO_NORM)
1707                         cfqq->ioprio = IOPRIO_NORM;
1708         } else {
1709                 /*
1710                  * check if we need to unboost the queue
1711                  */
1712                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1713                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1714                 if (cfqq->ioprio != cfqq->org_ioprio)
1715                         cfqq->ioprio = cfqq->org_ioprio;
1716         }
1717
1718         /*
1719          * refile between round-robin lists if we moved the priority class
1720          */
1721         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1722             cfq_cfqq_on_rr(cfqq))
1723                 cfq_resort_rr_list(cfqq, 0);
1724 }
1725
1726 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
1727 {
1728         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1729             !cfq_cfqq_must_alloc_slice(cfqq)) {
1730                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1731                 return ELV_MQUEUE_MUST;
1732         }
1733
1734         return ELV_MQUEUE_MAY;
1735 }
1736
1737 static int cfq_may_queue(request_queue_t *q, int rw)
1738 {
1739         struct cfq_data *cfqd = q->elevator->elevator_data;
1740         struct task_struct *tsk = current;
1741         struct cfq_queue *cfqq;
1742
1743         /*
1744          * don't force setup of a queue from here, as a call to may_queue
1745          * does not necessarily imply that a request actually will be queued.
1746          * so just lookup a possibly existing queue, or return 'may queue'
1747          * if that fails
1748          */
1749         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1750         if (cfqq) {
1751                 cfq_init_prio_data(cfqq);
1752                 cfq_prio_boost(cfqq);
1753
1754                 return __cfq_may_queue(cfqq);
1755         }
1756
1757         return ELV_MQUEUE_MAY;
1758 }
1759
1760 /*
1761  * queue lock held here
1762  */
1763 static void cfq_put_request(request_queue_t *q, struct request *rq)
1764 {
1765         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1766
1767         if (cfqq) {
1768                 const int rw = rq_data_dir(rq);
1769
1770                 BUG_ON(!cfqq->allocated[rw]);
1771                 cfqq->allocated[rw]--;
1772
1773                 put_io_context(RQ_CIC(rq)->ioc);
1774
1775                 rq->elevator_private = NULL;
1776                 rq->elevator_private2 = NULL;
1777
1778                 cfq_put_queue(cfqq);
1779         }
1780 }
1781
1782 /*
1783  * Allocate cfq data structures associated with this request.
1784  */
1785 static int
1786 cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
1787 {
1788         struct cfq_data *cfqd = q->elevator->elevator_data;
1789         struct task_struct *tsk = current;
1790         struct cfq_io_context *cic;
1791         const int rw = rq_data_dir(rq);
1792         pid_t key = cfq_queue_pid(tsk, rw);
1793         struct cfq_queue *cfqq;
1794         unsigned long flags;
1795         int is_sync = key != CFQ_KEY_ASYNC;
1796
1797         might_sleep_if(gfp_mask & __GFP_WAIT);
1798
1799         cic = cfq_get_io_context(cfqd, gfp_mask);
1800
1801         spin_lock_irqsave(q->queue_lock, flags);
1802
1803         if (!cic)
1804                 goto queue_fail;
1805
1806         if (!cic->cfqq[is_sync]) {
1807                 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
1808                 if (!cfqq)
1809                         goto queue_fail;
1810
1811                 cic->cfqq[is_sync] = cfqq;
1812         } else
1813                 cfqq = cic->cfqq[is_sync];
1814
1815         cfqq->allocated[rw]++;
1816         cfq_clear_cfqq_must_alloc(cfqq);
1817         atomic_inc(&cfqq->ref);
1818
1819         spin_unlock_irqrestore(q->queue_lock, flags);
1820
1821         rq->elevator_private = cic;
1822         rq->elevator_private2 = cfqq;
1823         return 0;
1824
1825 queue_fail:
1826         if (cic)
1827                 put_io_context(cic->ioc);
1828
1829         cfq_schedule_dispatch(cfqd);
1830         spin_unlock_irqrestore(q->queue_lock, flags);
1831         return 1;
1832 }
1833
1834 static void cfq_kick_queue(void *data)
1835 {
1836         request_queue_t *q = data;
1837         unsigned long flags;
1838
1839         spin_lock_irqsave(q->queue_lock, flags);
1840         blk_remove_plug(q);
1841         q->request_fn(q);
1842         spin_unlock_irqrestore(q->queue_lock, flags);
1843 }
1844
1845 /*
1846  * Timer running if the active_queue is currently idling inside its time slice
1847  */
1848 static void cfq_idle_slice_timer(unsigned long data)
1849 {
1850         struct cfq_data *cfqd = (struct cfq_data *) data;
1851         struct cfq_queue *cfqq;
1852         unsigned long flags;
1853
1854         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1855
1856         if ((cfqq = cfqd->active_queue) != NULL) {
1857                 unsigned long now = jiffies;
1858
1859                 /*
1860                  * expired
1861                  */
1862                 if (time_after(now, cfqq->slice_end))
1863                         goto expire;
1864
1865                 /*
1866                  * only expire and reinvoke request handler, if there are
1867                  * other queues with pending requests
1868                  */
1869                 if (!cfqd->busy_queues)
1870                         goto out_cont;
1871
1872                 /*
1873                  * not expired and it has a request pending, let it dispatch
1874                  */
1875                 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
1876                         cfq_mark_cfqq_must_dispatch(cfqq);
1877                         goto out_kick;
1878                 }
1879         }
1880 expire:
1881         cfq_slice_expired(cfqd, 0);
1882 out_kick:
1883         cfq_schedule_dispatch(cfqd);
1884 out_cont:
1885         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1886 }
1887
1888 /*
1889  * Timer running if an idle class queue is waiting for service
1890  */
1891 static void cfq_idle_class_timer(unsigned long data)
1892 {
1893         struct cfq_data *cfqd = (struct cfq_data *) data;
1894         unsigned long flags, end;
1895
1896         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1897
1898         /*
1899          * race with a non-idle queue, reset timer
1900          */
1901         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
1902         if (!time_after_eq(jiffies, end))
1903                 mod_timer(&cfqd->idle_class_timer, end);
1904         else
1905                 cfq_schedule_dispatch(cfqd);
1906
1907         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1908 }
1909
1910 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
1911 {
1912         del_timer_sync(&cfqd->idle_slice_timer);
1913         del_timer_sync(&cfqd->idle_class_timer);
1914         blk_sync_queue(cfqd->queue);
1915 }
1916
1917 static void cfq_exit_queue(elevator_t *e)
1918 {
1919         struct cfq_data *cfqd = e->elevator_data;
1920         request_queue_t *q = cfqd->queue;
1921
1922         cfq_shutdown_timer_wq(cfqd);
1923
1924         spin_lock_irq(q->queue_lock);
1925
1926         if (cfqd->active_queue)
1927                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
1928
1929         while (!list_empty(&cfqd->cic_list)) {
1930                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
1931                                                         struct cfq_io_context,
1932                                                         queue_list);
1933
1934                 __cfq_exit_single_io_context(cfqd, cic);
1935         }
1936
1937         spin_unlock_irq(q->queue_lock);
1938
1939         cfq_shutdown_timer_wq(cfqd);
1940
1941         kfree(cfqd->cfq_hash);
1942         kfree(cfqd);
1943 }
1944
1945 static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
1946 {
1947         struct cfq_data *cfqd;
1948         int i;
1949
1950         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
1951         if (!cfqd)
1952                 return NULL;
1953
1954         memset(cfqd, 0, sizeof(*cfqd));
1955
1956         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1957                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
1958
1959         INIT_LIST_HEAD(&cfqd->busy_rr);
1960         INIT_LIST_HEAD(&cfqd->cur_rr);
1961         INIT_LIST_HEAD(&cfqd->idle_rr);
1962         INIT_LIST_HEAD(&cfqd->empty_list);
1963         INIT_LIST_HEAD(&cfqd->cic_list);
1964
1965         cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
1966         if (!cfqd->cfq_hash)
1967                 goto out_free;
1968
1969         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1970                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1971
1972         cfqd->queue = q;
1973
1974         init_timer(&cfqd->idle_slice_timer);
1975         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
1976         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
1977
1978         init_timer(&cfqd->idle_class_timer);
1979         cfqd->idle_class_timer.function = cfq_idle_class_timer;
1980         cfqd->idle_class_timer.data = (unsigned long) cfqd;
1981
1982         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
1983
1984         cfqd->cfq_quantum = cfq_quantum;
1985         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
1986         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
1987         cfqd->cfq_back_max = cfq_back_max;
1988         cfqd->cfq_back_penalty = cfq_back_penalty;
1989         cfqd->cfq_slice[0] = cfq_slice_async;
1990         cfqd->cfq_slice[1] = cfq_slice_sync;
1991         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
1992         cfqd->cfq_slice_idle = cfq_slice_idle;
1993
1994         return cfqd;
1995 out_free:
1996         kfree(cfqd);
1997         return NULL;
1998 }
1999
2000 static void cfq_slab_kill(void)
2001 {
2002         if (cfq_pool)
2003                 kmem_cache_destroy(cfq_pool);
2004         if (cfq_ioc_pool)
2005                 kmem_cache_destroy(cfq_ioc_pool);
2006 }
2007
2008 static int __init cfq_slab_setup(void)
2009 {
2010         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2011                                         NULL, NULL);
2012         if (!cfq_pool)
2013                 goto fail;
2014
2015         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2016                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2017         if (!cfq_ioc_pool)
2018                 goto fail;
2019
2020         return 0;
2021 fail:
2022         cfq_slab_kill();
2023         return -ENOMEM;
2024 }
2025
2026 /*
2027  * sysfs parts below -->
2028  */
2029
/*
 * Format the unsigned tunable @var into @page as a decimal number followed
 * by a newline.
 *
 * @var is unsigned, so it must be printed with %u: with %d every value
 * above INT_MAX (which the store handlers allow, they clamp to UINT_MAX)
 * would be shown as a negative number.
 *
 * Returns the number of characters written, not counting the trailing NUL.
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
2035
/*
 * Parse a base-10 unsigned number from the start of @page with
 * simple_strtoul() and store it in *@var.
 *
 * The position where parsing stopped is not inspected.  Always returns
 * @count.
 */
static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *end;

	*var = simple_strtoul(page, &end, 10);
	return count;
}
2044
2045 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2046 static ssize_t __FUNC(elevator_t *e, char *page)                        \
2047 {                                                                       \
2048         struct cfq_data *cfqd = e->elevator_data;                       \
2049         unsigned int __data = __VAR;                                    \
2050         if (__CONV)                                                     \
2051                 __data = jiffies_to_msecs(__data);                      \
2052         return cfq_var_show(__data, (page));                            \
2053 }
2054 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2055 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2056 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2057 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2058 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2059 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2060 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2061 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2062 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2063 #undef SHOW_FUNCTION
2064
2065 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2066 static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)    \
2067 {                                                                       \
2068         struct cfq_data *cfqd = e->elevator_data;                       \
2069         unsigned int __data;                                            \
2070         int ret = cfq_var_store(&__data, (page), count);                \
2071         if (__data < (MIN))                                             \
2072                 __data = (MIN);                                         \
2073         else if (__data > (MAX))                                        \
2074                 __data = (MAX);                                         \
2075         if (__CONV)                                                     \
2076                 *(__PTR) = msecs_to_jiffies(__data);                    \
2077         else                                                            \
2078                 *(__PTR) = __data;                                      \
2079         return ret;                                                     \
2080 }
2081 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2082 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2083 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2084 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2085 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2086 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2087 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2088 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2089 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2090 #undef STORE_FUNCTION
2091
2092 #define CFQ_ATTR(name) \
2093         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2094
2095 static struct elv_fs_entry cfq_attrs[] = {
2096         CFQ_ATTR(quantum),
2097         CFQ_ATTR(fifo_expire_sync),
2098         CFQ_ATTR(fifo_expire_async),
2099         CFQ_ATTR(back_seek_max),
2100         CFQ_ATTR(back_seek_penalty),
2101         CFQ_ATTR(slice_sync),
2102         CFQ_ATTR(slice_async),
2103         CFQ_ATTR(slice_async_rq),
2104         CFQ_ATTR(slice_idle),
2105         __ATTR_NULL
2106 };
2107
2108 static struct elevator_type iosched_cfq = {
2109         .ops = {
2110                 .elevator_merge_fn =            cfq_merge,
2111                 .elevator_merged_fn =           cfq_merged_request,
2112                 .elevator_merge_req_fn =        cfq_merged_requests,
2113                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2114                 .elevator_add_req_fn =          cfq_insert_request,
2115                 .elevator_activate_req_fn =     cfq_activate_request,
2116                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2117                 .elevator_queue_empty_fn =      cfq_queue_empty,
2118                 .elevator_completed_req_fn =    cfq_completed_request,
2119                 .elevator_former_req_fn =       elv_rb_former_request,
2120                 .elevator_latter_req_fn =       elv_rb_latter_request,
2121                 .elevator_set_req_fn =          cfq_set_request,
2122                 .elevator_put_req_fn =          cfq_put_request,
2123                 .elevator_may_queue_fn =        cfq_may_queue,
2124                 .elevator_init_fn =             cfq_init_queue,
2125                 .elevator_exit_fn =             cfq_exit_queue,
2126                 .trim =                         cfq_free_io_context,
2127         },
2128         .elevator_attrs =       cfq_attrs,
2129         .elevator_name =        "cfq",
2130         .elevator_owner =       THIS_MODULE,
2131 };
2132
2133 static int __init cfq_init(void)
2134 {
2135         int ret;
2136
2137         /*
2138          * could be 0 on HZ < 1000 setups
2139          */
2140         if (!cfq_slice_async)
2141                 cfq_slice_async = 1;
2142         if (!cfq_slice_idle)
2143                 cfq_slice_idle = 1;
2144
2145         if (cfq_slab_setup())
2146                 return -ENOMEM;
2147
2148         ret = elv_register(&iosched_cfq);
2149         if (ret)
2150                 cfq_slab_kill();
2151
2152         return ret;
2153 }
2154
2155 static void __exit cfq_exit(void)
2156 {
2157         DECLARE_COMPLETION(all_gone);
2158         elv_unregister(&iosched_cfq);
2159         ioc_gone = &all_gone;
2160         /* ioc_gone's update must be visible before reading ioc_count */
2161         smp_wmb();
2162         if (elv_ioc_count_read(ioc_count))
2163                 wait_for_completion(ioc_gone);
2164         synchronize_rcu();
2165         cfq_slab_kill();
2166 }
2167
2168 module_init(cfq_init);
2169 module_exit(cfq_exit);
2170
2171 MODULE_AUTHOR("Jens Axboe");
2172 MODULE_LICENSE("GPL");
2173 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");