blkio: Changes to IO controller additional stats patches
block/blk-cgroup.c

/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include "blk-cgroup.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
                                                  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
                              struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
                           struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .attach = blkiocg_attach,
        .destroy = blkiocg_destroy,
        .populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
        /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
        .subsys_id = blkio_subsys_id,
#endif
        .use_id = 1,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

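/* Return the blkio_cgroup that embeds the given cgroup's blkio subsystem state. */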
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

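/* Initialize the stats spinlock of a newly set up blkio_group. */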
void blkio_group_init(struct blkio_group *blkg)
{
        spin_lock_init(&blkg->stats_lock);
}
EXPORT_SYMBOL_GPL(blkio_group_init);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
                                bool sync)
{
        if (direction)
                stat[BLKIO_STAT_WRITE] += add;
        else
                stat[BLKIO_STAT_READ] += add;
        if (sync)
                stat[BLKIO_STAT_SYNC] += add;
        else
                stat[BLKIO_STAT_ASYNC] += add;
}

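/* Charge a used timeslice to the group's cumulative "time" statistic. */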
void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkg->stats.time += time;
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

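/*
 * Account a dispatched request: update the sectors count and the
 * io_serviced/io_service_bytes stats, split by direction and sync flag.
 */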
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
                                uint64_t bytes, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        stats->sectors += bytes >> 9;
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
                        sync);
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
                        direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

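/*
 * Account a completed request: service time is measured from dispatch
 * (io_start_time) to now, wait time from queueing (start_time) to dispatch.
 */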
void blkiocg_update_completion_stats(struct blkio_group *blkg,
        uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;
        unsigned long long now = sched_clock();

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        if (time_after64(now, io_start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
                                now - io_start_time, direction, sync);
        if (time_after64(io_start_time, start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
                                io_start_time - start_time, direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

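/*
 * Link a blkio_group into its cgroup's list under blkcg->lock and record
 * the css id, device number and (in debug builds) the cgroup path.
 */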
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                        struct blkio_group *blkg, void *key, dev_t dev)
{
        unsigned long flags;

        spin_lock_irqsave(&blkcg->lock, flags);
        rcu_assign_pointer(blkg->key, key);
        blkg->blkcg_id = css_id(&blkcg->css);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        spin_unlock_irqrestore(&blkcg->lock, flags);
#ifdef CONFIG_DEBUG_BLK_CGROUP
        /* Need to take css reference ? */
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
        blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        hlist_del_init_rcu(&blkg->blkcg_node);
        blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        struct blkio_cgroup *blkcg;
        unsigned long flags;
        struct cgroup_subsys_state *css;
        int ret = 1;

        rcu_read_lock();
        css = css_lookup(&blkio_subsys, blkg->blkcg_id);
        if (!css)
                goto out;

        blkcg = container_of(css, struct blkio_cgroup, css);
        spin_lock_irqsave(&blkcg->lock, flags);
        if (!hlist_unhashed(&blkg->blkcg_node)) {
                __blkiocg_del_blkio_group(blkg);
                ret = 0;
        }
        spin_unlock_irqrestore(&blkcg->lock, flags);
out:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        void *__key;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
                __key = blkg->key;
                if (__key == key)
                        return blkg;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

#define SHOW_FUNCTION(__VAR)                                            \
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,                \
                                       struct cftype *cftype)           \
{                                                                       \
        struct blkio_cgroup *blkcg;                                     \
                                                                        \
        blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
        return (u64)blkcg->__VAR;                                       \
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

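/*
 * Update the cgroup weight and propagate the new value to every registered
 * policy for each group attached to this cgroup.
 */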
static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        struct hlist_node *n;
        struct blkio_policy_type *blkiop;

        if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
                return -EINVAL;

        blkcg = cgroup_to_blkio_cgroup(cgroup);
        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);
        blkcg->weight = (unsigned int)val;
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                list_for_each_entry(blkiop, &blkio_list, list)
                        blkiop->ops.blkio_update_group_weight_fn(blkg,
                                        blkcg->weight);
        }
        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);
        return 0;
}

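/* Handler for the "reset_stats" file: zero the stats of every group in the cgroup. */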
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        struct hlist_node *n;
        struct blkio_group_stats *stats;

        blkcg = cgroup_to_blkio_cgroup(cgroup);
        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                spin_lock(&blkg->stats_lock);
                stats = &blkg->stats;
                memset(stats, 0, sizeof(struct blkio_group_stats));
                spin_unlock(&blkg->stats_lock);
        }
        spin_unlock_irq(&blkcg->lock);
        return 0;
}

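/*
 * Build the "major:minor [Read|Write|Sync|Async|Total]" key used for the
 * per-device entries of the map-style stat files.
 */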
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
                                int chars_left, bool diskname_only)
{
        snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
        chars_left -= strlen(str);
        if (chars_left <= 0) {
                printk(KERN_WARNING
                        "Possibly incorrect cgroup stat display format\n");
                return;
        }
        if (diskname_only)
                return;
        switch (type) {
        case BLKIO_STAT_READ:
                strlcat(str, " Read", chars_left);
                break;
        case BLKIO_STAT_WRITE:
                strlcat(str, " Write", chars_left);
                break;
        case BLKIO_STAT_SYNC:
                strlcat(str, " Sync", chars_left);
                break;
        case BLKIO_STAT_ASYNC:
                strlcat(str, " Async", chars_left);
                break;
        case BLKIO_STAT_TOTAL:
                strlcat(str, " Total", chars_left);
                break;
        default:
                strlcat(str, " Invalid", chars_left);
        }
}

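/* Emit a single per-device value through the map callback and return it. */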
static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
                                struct cgroup_map_cb *cb, dev_t dev)
{
        blkio_get_key_name(0, dev, str, chars_left, true);
        cb->fill(cb, str, val);
        return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
                struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
        uint64_t disk_total;
        char key_str[MAX_KEY_LEN];
        enum stat_sub_type sub_type;

        if (type == BLKIO_STAT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.time, cb, dev);
        if (type == BLKIO_STAT_SECTORS)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
        if (type == BLKIO_STAT_DEQUEUE)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.dequeue, cb, dev);
#endif

        for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
                        sub_type++) {
                blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
                cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
        }
        disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
                        blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
        blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
        cb->fill(cb, key_str, disk_total);
        return disk_total;
}

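/*
 * Generate the read_map handlers: walk every group of the cgroup under RCU,
 * emit its per-device stats and, if requested, a cgroup-wide "Total" line.
 */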
#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)                \
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,                \
                struct cftype *cftype, struct cgroup_map_cb *cb)        \
{                                                                       \
        struct blkio_cgroup *blkcg;                                     \
        struct blkio_group *blkg;                                       \
        struct hlist_node *n;                                           \
        uint64_t cgroup_total = 0;                                      \
                                                                        \
        if (!cgroup_lock_live_group(cgroup))                            \
                return -ENODEV;                                         \
                                                                        \
        blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
        rcu_read_lock();                                                \
        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
                if (blkg->dev) {                                        \
                        spin_lock_irq(&blkg->stats_lock);               \
                        cgroup_total += blkio_get_stat(blkg, cb,        \
                                                blkg->dev, type);       \
                        spin_unlock_irq(&blkg->stats_lock);             \
                }                                                       \
        }                                                               \
        if (show_total)                                                 \
                cb->fill(cb, "Total", cgroup_total);                    \
        rcu_read_unlock();                                              \
        cgroup_unlock();                                                \
        return 0;                                                       \
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

#ifdef CONFIG_DEBUG_BLK_CGROUP
void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
                        unsigned long dequeue)
{
        blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#endif

struct cftype blkio_files[] = {
        {
                .name = "weight",
                .read_u64 = blkiocg_weight_read,
                .write_u64 = blkiocg_weight_write,
        },
        {
                .name = "time",
                .read_map = blkiocg_time_read,
        },
        {
                .name = "sectors",
                .read_map = blkiocg_sectors_read,
        },
        {
                .name = "io_service_bytes",
                .read_map = blkiocg_io_service_bytes_read,
        },
        {
                .name = "io_serviced",
                .read_map = blkiocg_io_serviced_read,
        },
        {
                .name = "io_service_time",
                .read_map = blkiocg_io_service_time_read,
        },
        {
                .name = "io_wait_time",
                .read_map = blkiocg_io_wait_time_read,
        },
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
#ifdef CONFIG_DEBUG_BLK_CGROUP
        {
                .name = "dequeue",
                .read_map = blkiocg_dequeue_read,
        },
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        return cgroup_add_files(cgroup, subsys, blkio_files,
                                ARRAY_SIZE(blkio_files));
}

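/*
 * Tear down a cgroup: unlink every blkio_group, let each registered policy
 * release its per-group state, then free the blkio_cgroup (unless it is the
 * static root).
 */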
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        unsigned long flags;
        struct blkio_group *blkg;
        void *key;
        struct blkio_policy_type *blkiop;

        rcu_read_lock();
remove_entry:
        spin_lock_irqsave(&blkcg->lock, flags);

        if (hlist_empty(&blkcg->blkg_list)) {
                spin_unlock_irqrestore(&blkcg->lock, flags);
                goto done;
        }

        blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
                                blkcg_node);
        key = rcu_dereference(blkg->key);
        __blkiocg_del_blkio_group(blkg);

        spin_unlock_irqrestore(&blkcg->lock, flags);

        /*
         * This blkio_group is being unlinked because the associated cgroup
         * is going away. Let all the IO controlling policies know about
         * this event.
         *
         * Currently this is a static call to one IO controlling policy.
         * Once we have more policies in place, we need some dynamic
         * registration of the callback function.
         */
        spin_lock(&blkio_list_lock);
        list_for_each_entry(blkiop, &blkio_list, list)
                blkiop->ops.blkio_unlink_group_fn(key, blkg);
        spin_unlock(&blkio_list_lock);
        goto remove_entry;
done:
        free_css_id(&blkio_subsys, &blkcg->css);
        rcu_read_unlock();
        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

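/*
 * Allocate a blkio_cgroup for a new cgroup (the root uses the static
 * blkio_root_cgroup) and initialize its lock and group list.
 */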
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg, *parent_blkcg;

        if (!cgroup->parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        /* Currently we do not support a hierarchy deeper than two levels (0,1) */
        parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
        if (css_depth(&parent_blkcg->css) > 0)
                return ERR_PTR(-EINVAL);

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
                                struct cgroup *cgroup, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                ret = -EINVAL;
        task_unlock(tsk);

        return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
                                struct cgroup *prev, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;

        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc)
                ioc->cgroup_changed = 1;
        task_unlock(tsk);
}

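/* Register or unregister an IO control policy on the global blkio_list. */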
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_add_tail(&blkiop->list, &blkio_list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_del_init(&blkiop->list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

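/* Module init/exit: load and unload the blkio controller as a cgroup subsystem. */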
static int __init init_cgroup_blkio(void)
{
        return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
        cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");