[NET]: Prevent multiple qdisc runs
[safe/jmp/linux-2.6] net/sched/sch_generic.c
1 /*
2  * net/sched/sch_generic.c      Generic packet scheduler routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/config.h>
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/sched.h>
22 #include <linux/string.h>
23 #include <linux/mm.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/in.h>
27 #include <linux/errno.h>
28 #include <linux/interrupt.h>
29 #include <linux/netdevice.h>
30 #include <linux/skbuff.h>
31 #include <linux/rtnetlink.h>
32 #include <linux/init.h>
33 #include <linux/rcupdate.h>
34 #include <linux/list.h>
35 #include <net/sock.h>
36 #include <net/pkt_sched.h>
37
38 /* Main transmission queue. */
39
40 /* Main qdisc structure lock.
41
42    However, modifications
43    to data participating in scheduling must additionally be
44    protected with the dev->queue_lock spinlock.
45
46    The idea is the following:
47    - enqueue and dequeue are serialized via the top level device
48      spinlock dev->queue_lock.
49    - tree walking is protected by read_lock_bh(qdisc_tree_lock)
50      and this lock is used only in process context.
51    - updates to the tree are made under the rtnl semaphore or
52      from softirq context (the __qdisc_destroy rcu callback),
53      hence this lock needs local bh disabling.
54
55    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
56  */
57 DEFINE_RWLOCK(qdisc_tree_lock);
58
59 void qdisc_lock_tree(struct net_device *dev)
60 {
61         write_lock_bh(&qdisc_tree_lock);
62         spin_lock_bh(&dev->queue_lock);
63 }
64
65 void qdisc_unlock_tree(struct net_device *dev)
66 {
67         spin_unlock_bh(&dev->queue_lock);
68         write_unlock_bh(&qdisc_tree_lock);
69 }
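
/* A minimal usage sketch (illustration only, not part of this file):
 * callers that change the qdisc tree, such as the grafting code in
 * net/sched/sch_api.c, are assumed to bracket the update like this,
 * which also satisfies the "qdisc_tree_lock before dev->queue_lock"
 * ordering documented above:
 *
 *	qdisc_lock_tree(dev);
 *	... replace dev->qdisc_sleeping / edit dev->qdisc_list ...
 *	qdisc_unlock_tree(dev);
 */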
70
71 /*
72    dev->queue_lock serializes queue accesses for this device
73    AND the dev->qdisc pointer itself.
74
75    netif_tx_lock serializes accesses to the device driver.
76
77    dev->queue_lock and netif_tx_lock are mutually exclusive:
78    if one is grabbed, the other must be free.
79  */
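
/* For illustration, the lock hand-off performed by qdisc_restart() below
 * (non-LLTX case) follows this order, so the driver's hard_start_xmit()
 * is never entered with dev->queue_lock held:
 *
 *	spin_lock(&dev->queue_lock);		caller (__qdisc_run)
 *	netif_tx_trylock(dev);			grab the driver lock
 *	spin_unlock(&dev->queue_lock);
 *	dev->hard_start_xmit(skb, dev);
 *	netif_tx_unlock(dev);
 *	spin_lock(&dev->queue_lock);		back to queue work
 */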
80
81
82 /* Kick the device.
83    Note that this procedure can be called by a watchdog timer, so
84    we do not check whether the device is busy here.
85
86    Returns:  0  - queue is empty.
87             >0  - queue is not empty, but throttled; the packet was requeued.
88             <0  - queue is not empty; one packet was handled, call again.
89
90    NOTE: Called under dev->queue_lock with locally disabled BH.
91 */
92
93 static inline int qdisc_restart(struct net_device *dev)
94 {
95         struct Qdisc *q = dev->qdisc;
96         struct sk_buff *skb;
97
98         /* Dequeue packet */
99         if ((skb = q->dequeue(q)) != NULL) {
100                 unsigned nolock = (dev->features & NETIF_F_LLTX);
101                 /*
102                  * When the driver has LLTX set it does its own locking
103                  * in start_xmit. No need to add additional overhead by
104                  * locking again. These checks are worth it because
105                  * even uncongested locks can be quite expensive.
106                  * The driver can do a trylock, as here, too: on lock
107                  * contention it should return NETDEV_TX_LOCKED and the
108                  * packet will be requeued (see the sketch after this function).
109                  */
110                 if (!nolock) {
111                         if (!netif_tx_trylock(dev)) {
112                         collision:
113                                 /* So, someone grabbed the driver. */
114                                 
115                 /* It may be a transient configuration error
116                    when hard_start_xmit() recurses. We detect
117                    it by checking the xmit lock owner and drop
118                    the packet when a dead loop is detected.
119                 */
120                                 if (dev->xmit_lock_owner == smp_processor_id()) {
121                                         kfree_skb(skb);
122                                         if (net_ratelimit())
123                                                 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
124                                         return -1;
125                                 }
126                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
127                                 goto requeue;
128                         }
129                 }
130                 
131                 {
132                         /* And release queue */
133                         spin_unlock(&dev->queue_lock);
134
135                         if (!netif_queue_stopped(dev)) {
136                                 int ret;
137                                 if (netdev_nit)
138                                         dev_queue_xmit_nit(skb, dev);
139
140                                 ret = dev->hard_start_xmit(skb, dev);
141                                 if (ret == NETDEV_TX_OK) { 
142                                         if (!nolock) {
143                                                 netif_tx_unlock(dev);
144                                         }
145                                         spin_lock(&dev->queue_lock);
146                                         return -1;
147                                 }
148                                 if (ret == NETDEV_TX_LOCKED && nolock) {
149                                         spin_lock(&dev->queue_lock);
150                                         goto collision; 
151                                 }
152                         }
153
154                         /* NETDEV_TX_BUSY - we need to requeue */
155                         /* Release the driver */
156                         if (!nolock) { 
157                                 netif_tx_unlock(dev);
158                         } 
159                         spin_lock(&dev->queue_lock);
160                         q = dev->qdisc;
161                 }
162
163                 /* Device kicked us out :(
164                    This is possible in the following cases:
165
166                    0. driver is locked
167                    1. fastroute is enabled
168                    2. device cannot determine busy state
169                       before the start of transmission (e.g. dialout)
170                    3. device is buggy (ppp)
171                  */
172
173 requeue:
174                 q->ops->requeue(skb, q);
175                 netif_schedule(dev);
176                 return 1;
177         }
178         BUG_ON((int) q->q.qlen < 0);
179         return q->q.qlen;
180 }
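
/* Sketch of the LLTX driver side referred to in the comment inside
 * qdisc_restart() above. This is a hypothetical driver, shown only to
 * illustrate the contract; "my_priv", "tx_lock" and "my_xmit_frame" are
 * made-up names, not an existing API:
 *
 *	static int my_start_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		if (!spin_trylock(&priv->tx_lock))
 *			return NETDEV_TX_LOCKED;   // qdisc_restart() requeues
 *		my_xmit_frame(priv, skb);
 *		spin_unlock(&priv->tx_lock);
 *		return NETDEV_TX_OK;
 *	}
 */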
181
182 void __qdisc_run(struct net_device *dev)
183 {
184         while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
185                 /* NOTHING */;
186
187         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
188 }
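
/* For reference, the caller that pairs with __qdisc_run() is expected to
 * look roughly like the inline below (cf. qdisc_run() in
 * include/net/pkt_sched.h for this patch). The __LINK_STATE_QDISC_RUNNING
 * bit taken there, and cleared above, is what keeps more than one qdisc
 * run from executing on the same device at once:
 *
 *	static inline void qdisc_run(struct net_device *dev)
 *	{
 *		if (!netif_queue_stopped(dev) &&
 *		    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *			__qdisc_run(dev);
 *	}
 */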
189
190 static void dev_watchdog(unsigned long arg)
191 {
192         struct net_device *dev = (struct net_device *)arg;
193
194         netif_tx_lock(dev);
195         if (dev->qdisc != &noop_qdisc) {
196                 if (netif_device_present(dev) &&
197                     netif_running(dev) &&
198                     netif_carrier_ok(dev)) {
199                         if (netif_queue_stopped(dev) &&
200                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
201
202                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
203                                        dev->name);
204                                 dev->tx_timeout(dev);
205                         }
206                         if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
207                                 dev_hold(dev);
208                 }
209         }
210         netif_tx_unlock(dev);
211
212         dev_put(dev);
213 }
214
215 static void dev_watchdog_init(struct net_device *dev)
216 {
217         init_timer(&dev->watchdog_timer);
218         dev->watchdog_timer.data = (unsigned long)dev;
219         dev->watchdog_timer.function = dev_watchdog;
220 }
221
222 void __netdev_watchdog_up(struct net_device *dev)
223 {
224         if (dev->tx_timeout) {
225                 if (dev->watchdog_timeo <= 0)
226                         dev->watchdog_timeo = 5*HZ;
227                 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
228                         dev_hold(dev);
229         }
230 }
231
232 static void dev_watchdog_up(struct net_device *dev)
233 {
234         netif_tx_lock_bh(dev);
235         __netdev_watchdog_up(dev);
236         netif_tx_unlock_bh(dev);
237 }
238
239 static void dev_watchdog_down(struct net_device *dev)
240 {
241         netif_tx_lock_bh(dev);
242         if (del_timer(&dev->watchdog_timer))
243                 dev_put(dev);
244         netif_tx_unlock_bh(dev);
245 }
246
247 void netif_carrier_on(struct net_device *dev)
248 {
249         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
250                 linkwatch_fire_event(dev);
251         if (netif_running(dev))
252                 __netdev_watchdog_up(dev);
253 }
254
255 void netif_carrier_off(struct net_device *dev)
256 {
257         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
258                 linkwatch_fire_event(dev);
259 }
260
261 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
262    under all circumstances. It is difficult to invent anything faster or
263    cheaper.
264  */
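
/* noop_qdisc is what dev_init_scheduler() and dev_deactivate() below install
 * as dev->qdisc, so packets handed to a device with no active qdisc are
 * simply dropped and reported as NET_XMIT_CN.
 */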
265
266 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
267 {
268         kfree_skb(skb);
269         return NET_XMIT_CN;
270 }
271
272 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
273 {
274         return NULL;
275 }
276
277 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
278 {
279         if (net_ratelimit())
280                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
281                        skb->dev->name);
282         kfree_skb(skb);
283         return NET_XMIT_CN;
284 }
285
286 struct Qdisc_ops noop_qdisc_ops = {
287         .id             =       "noop",
288         .priv_size      =       0,
289         .enqueue        =       noop_enqueue,
290         .dequeue        =       noop_dequeue,
291         .requeue        =       noop_requeue,
292         .owner          =       THIS_MODULE,
293 };
294
295 struct Qdisc noop_qdisc = {
296         .enqueue        =       noop_enqueue,
297         .dequeue        =       noop_dequeue,
298         .flags          =       TCQ_F_BUILTIN,
299         .ops            =       &noop_qdisc_ops,        
300         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
301 };
302
303 static struct Qdisc_ops noqueue_qdisc_ops = {
304         .id             =       "noqueue",
305         .priv_size      =       0,
306         .enqueue        =       noop_enqueue,
307         .dequeue        =       noop_dequeue,
308         .requeue        =       noop_requeue,
309         .owner          =       THIS_MODULE,
310 };
311
312 static struct Qdisc noqueue_qdisc = {
313         .enqueue        =       NULL,
314         .dequeue        =       noop_dequeue,
315         .flags          =       TCQ_F_BUILTIN,
316         .ops            =       &noqueue_qdisc_ops,
317         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
318 };
319
320
321 static const u8 prio2band[TC_PRIO_MAX+1] =
322         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
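
/* Reading the table with the usual TC_PRIO_* values from
 * <linux/pkt_sched.h> (stated here as an assumption): TC_PRIO_INTERACTIVE (6)
 * and TC_PRIO_CONTROL (7) map to band 0, which pfifo_fast_dequeue() empties
 * first; TC_PRIO_BESTEFFORT (0) maps to band 1; TC_PRIO_BULK (2) maps to
 * band 2.
 */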
323
324 /* 3-band FIFO queue: old style, but should be a bit faster than
325    the generic prio+fifo combination.
326  */
327
328 #define PFIFO_FAST_BANDS 3
329
330 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
331                                              struct Qdisc *qdisc)
332 {
333         struct sk_buff_head *list = qdisc_priv(qdisc);
334         return list + prio2band[skb->priority & TC_PRIO_MAX];
335 }
336
337 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
338 {
339         struct sk_buff_head *list = prio2list(skb, qdisc);
340
341         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
342                 qdisc->q.qlen++;
343                 return __qdisc_enqueue_tail(skb, qdisc, list);
344         }
345
346         return qdisc_drop(skb, qdisc);
347 }
348
349 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
350 {
351         int prio;
352         struct sk_buff_head *list = qdisc_priv(qdisc);
353
354         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
355                 if (!skb_queue_empty(list + prio)) {
356                         qdisc->q.qlen--;
357                         return __qdisc_dequeue_head(qdisc, list + prio);
358                 }
359         }
360
361         return NULL;
362 }
363
364 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
365 {
366         qdisc->q.qlen++;
367         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
368 }
369
370 static void pfifo_fast_reset(struct Qdisc* qdisc)
371 {
372         int prio;
373         struct sk_buff_head *list = qdisc_priv(qdisc);
374
375         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
376                 __qdisc_reset_queue(qdisc, list + prio);
377
378         qdisc->qstats.backlog = 0;
379         qdisc->q.qlen = 0;
380 }
381
382 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
383 {
384         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
385
386         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
387         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
388         return skb->len;
389
390 rtattr_failure:
391         return -1;
392 }
393
394 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
395 {
396         int prio;
397         struct sk_buff_head *list = qdisc_priv(qdisc);
398
399         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
400                 skb_queue_head_init(list + prio);
401
402         return 0;
403 }
404
405 static struct Qdisc_ops pfifo_fast_ops = {
406         .id             =       "pfifo_fast",
407         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
408         .enqueue        =       pfifo_fast_enqueue,
409         .dequeue        =       pfifo_fast_dequeue,
410         .requeue        =       pfifo_fast_requeue,
411         .init           =       pfifo_fast_init,
412         .reset          =       pfifo_fast_reset,
413         .dump           =       pfifo_fast_dump,
414         .owner          =       THIS_MODULE,
415 };
416
417 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
418 {
419         void *p;
420         struct Qdisc *sch;
421         unsigned int size;
422         int err = -ENOBUFS;
423
424         /* ensure that the Qdisc and the private data are 32-byte aligned */
425         size = QDISC_ALIGN(sizeof(*sch));
426         size += ops->priv_size + (QDISC_ALIGNTO - 1);
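        /* The extra QDISC_ALIGNTO - 1 bytes leave room to round the start
         * of the Qdisc up to an aligned boundary inside the allocation. */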
427
428         p = kmalloc(size, GFP_KERNEL);
429         if (!p)
430                 goto errout;
431         memset(p, 0, size);
432         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
433         sch->padded = (char *) sch - (char *) p;
434
435         INIT_LIST_HEAD(&sch->list);
436         skb_queue_head_init(&sch->q);
437         sch->ops = ops;
438         sch->enqueue = ops->enqueue;
439         sch->dequeue = ops->dequeue;
440         sch->dev = dev;
441         dev_hold(dev);
442         sch->stats_lock = &dev->queue_lock;
443         atomic_set(&sch->refcnt, 1);
444
445         return sch;
446 errout:
447         return ERR_PTR(err);    /* err is already a negative errno (-ENOBUFS) */
448 }
449
450 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
451 {
452         struct Qdisc *sch;
453         
454         sch = qdisc_alloc(dev, ops);
455         if (IS_ERR(sch))
456                 goto errout;
457
458         if (!ops->init || ops->init(sch, NULL) == 0)
459                 return sch;
460
461         qdisc_destroy(sch);
462 errout:
463         return NULL;
464 }
465
466 /* Under dev->queue_lock and BH! */
467
468 void qdisc_reset(struct Qdisc *qdisc)
469 {
470         struct Qdisc_ops *ops = qdisc->ops;
471
472         if (ops->reset)
473                 ops->reset(qdisc);
474 }
475
476 /* This is the RCU callback function to clean up a qdisc when there
477  * are no further references to it. */
478
479 static void __qdisc_destroy(struct rcu_head *head)
480 {
481         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
482         struct Qdisc_ops  *ops = qdisc->ops;
483
484 #ifdef CONFIG_NET_ESTIMATOR
485         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
486 #endif
487         write_lock(&qdisc_tree_lock);
488         if (ops->reset)
489                 ops->reset(qdisc);
490         if (ops->destroy)
491                 ops->destroy(qdisc);
492         write_unlock(&qdisc_tree_lock);
493         module_put(ops->owner);
494
495         dev_put(qdisc->dev);
496         kfree((char *) qdisc - qdisc->padded);
497 }
498
499 /* Under dev->queue_lock and BH! */
500
501 void qdisc_destroy(struct Qdisc *qdisc)
502 {
503         struct list_head cql = LIST_HEAD_INIT(cql);
504         struct Qdisc *cq, *q, *n;
505
506         if (qdisc->flags & TCQ_F_BUILTIN ||
507                 !atomic_dec_and_test(&qdisc->refcnt))
508                 return;
509
510         if (!list_empty(&qdisc->list)) {
511                 if (qdisc->ops->cl_ops == NULL)
512                         list_del(&qdisc->list);
513                 else
514                         list_move(&qdisc->list, &cql);
515         }
516
517         /* unlink inner qdiscs from dev->qdisc_list immediately */
518         list_for_each_entry(cq, &cql, list)
519                 list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
520                         if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
521                                 if (q->ops->cl_ops == NULL)
522                                         list_del_init(&q->list);
523                                 else
524                                         list_move_tail(&q->list, &cql);
525                         }
526         list_for_each_entry_safe(cq, n, &cql, list)
527                 list_del_init(&cq->list);
528
529         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
530 }
531
532 void dev_activate(struct net_device *dev)
533 {
534         /* No queueing discipline is attached to the device;
535            create a default one, i.e. pfifo_fast for devices
536            which need queueing and noqueue_qdisc for
537            virtual interfaces.
538          */
539
540         if (dev->qdisc_sleeping == &noop_qdisc) {
541                 struct Qdisc *qdisc;
542                 if (dev->tx_queue_len) {
543                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
544                         if (qdisc == NULL) {
545                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
546                                 return;
547                         }
548                         write_lock_bh(&qdisc_tree_lock);
549                         list_add_tail(&qdisc->list, &dev->qdisc_list);
550                         write_unlock_bh(&qdisc_tree_lock);
551                 } else {
552                         qdisc =  &noqueue_qdisc;
553                 }
554                 write_lock_bh(&qdisc_tree_lock);
555                 dev->qdisc_sleeping = qdisc;
556                 write_unlock_bh(&qdisc_tree_lock);
557         }
558
559         if (!netif_carrier_ok(dev))
560                 /* Delay activation until next carrier-on event */
561                 return;
562
563         spin_lock_bh(&dev->queue_lock);
564         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
565         if (dev->qdisc != &noqueue_qdisc) {
566                 dev->trans_start = jiffies;
567                 dev_watchdog_up(dev);
568         }
569         spin_unlock_bh(&dev->queue_lock);
570 }
571
572 void dev_deactivate(struct net_device *dev)
573 {
574         struct Qdisc *qdisc;
575
576         spin_lock_bh(&dev->queue_lock);
577         qdisc = dev->qdisc;
578         dev->qdisc = &noop_qdisc;
579
580         qdisc_reset(qdisc);
581
582         spin_unlock_bh(&dev->queue_lock);
583
584         dev_watchdog_down(dev);
585
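        /* Wait until the device is no longer scheduled for a qdisc run by
         * the tx softirq, so nothing is still working on the old qdisc. */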
586         while (test_bit(__LINK_STATE_SCHED, &dev->state))
587                 yield();
588
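        /* Make sure no CPU is still inside the driver's transmit routine
         * holding dev->_xmit_lock. */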
589         spin_unlock_wait(&dev->_xmit_lock);
590 }
591
592 void dev_init_scheduler(struct net_device *dev)
593 {
594         qdisc_lock_tree(dev);
595         dev->qdisc = &noop_qdisc;
596         dev->qdisc_sleeping = &noop_qdisc;
597         INIT_LIST_HEAD(&dev->qdisc_list);
598         qdisc_unlock_tree(dev);
599
600         dev_watchdog_init(dev);
601 }
602
603 void dev_shutdown(struct net_device *dev)
604 {
605         struct Qdisc *qdisc;
606
607         qdisc_lock_tree(dev);
608         qdisc = dev->qdisc_sleeping;
609         dev->qdisc = &noop_qdisc;
610         dev->qdisc_sleeping = &noop_qdisc;
611         qdisc_destroy(qdisc);
612 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
613         if ((qdisc = dev->qdisc_ingress) != NULL) {
614                 dev->qdisc_ingress = NULL;
615                 qdisc_destroy(qdisc);
616         }
617 #endif
618         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
619         qdisc_unlock_tree(dev);
620 }
621
622 EXPORT_SYMBOL(__netdev_watchdog_up);
623 EXPORT_SYMBOL(netif_carrier_on);
624 EXPORT_SYMBOL(netif_carrier_off);
625 EXPORT_SYMBOL(noop_qdisc);
626 EXPORT_SYMBOL(noop_qdisc_ops);
627 EXPORT_SYMBOL(qdisc_create_dflt);
628 EXPORT_SYMBOL(qdisc_alloc);
629 EXPORT_SYMBOL(qdisc_destroy);
630 EXPORT_SYMBOL(qdisc_reset);
631 EXPORT_SYMBOL(qdisc_lock_tree);
632 EXPORT_SYMBOL(qdisc_unlock_tree);