[NET_SCHED]: Fix qdisc_restart return value when dequeue is empty
/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);
        spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock(&dev->ingress_lock);
        spin_unlock_bh(&dev->queue_lock);
}
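
/* Usage sketch (illustrative, not a definition from this file): callers
 * that swap qdisc pointers nest both locks inside the RTNL mutex, in the
 * same way dev_init_scheduler() and dev_shutdown() below do:
 *
 *      rtnl_lock();
 *      qdisc_lock_tree(dev);
 *      dev->qdisc_sleeping = &noop_qdisc;      (pointer swaps go here)
 *      qdisc_unlock_tree(dev);
 *      rtnl_unlock();
 */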

/*
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick device.

   Returns:  0  - queue is empty or throttled.
            >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set, it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do a trylock, as is done here; in case
                 * of lock contention it should return NETDEV_TX_LOCKED
                 * and the packet will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        goto out;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto out;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in four cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (unlikely(q == &noop_qdisc))
                        kfree_skb(skb);
                else if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
        }
        return 0;

out:
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
        do {
                if (!qdisc_restart(dev))
                        break;
        } while (!netif_queue_stopped(dev));

        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
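
/* For reference: the stack does not call __qdisc_run() directly.  The
 * qdisc_run() wrapper in include/net/pkt_sched.h claims the
 * __LINK_STATE_QDISC_RUNNING bit first, so only one CPU loops in
 * qdisc_restart() per device at any time.  A sketch of that wrapper,
 * for orientation only:
 *
 *      static inline void qdisc_run(struct net_device *dev)
 *      {
 *              if (!netif_queue_stopped(dev) &&
 *                  !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *                      __qdisc_run(dev);
 *      }
 */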

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}
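
/* How a driver hooks into the watchdog above (illustrative sketch; the
 * handler name is made up): it provides a tx_timeout handler and,
 * optionally, a timeout before register_netdev().  A zero or negative
 * watchdog_timeo is bumped to 5*HZ by __netdev_watchdog_up().
 *
 *      dev->tx_timeout = mydrv_tx_timeout;
 *      dev->watchdog_timeo = 2 * HZ;
 */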

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}
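
/* Typical driver usage (sketch, names illustrative): call
 * netif_carrier_off() at open time while the link state is unknown, then
 * report changes from the link-change handler.  netif_carrier_on() also
 * restarts the watchdog via __netdev_watchdog_up().
 *
 *      netif_carrier_off(dev);                 at open, link not up yet
 *      ...
 *      if (mydrv_link_up(priv))                in the link-change handler
 *              netif_carrier_on(dev);
 *      else
 *              netif_carrier_off(dev);
 */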

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};
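
/* Note: noqueue_qdisc deliberately leaves .enqueue NULL.  dev_queue_xmit()
 * only enqueues when q->enqueue is non-NULL, so packets on a "noqueue"
 * device bypass the scheduler and go straight to the driver; that is also
 * why dev_activate() below skips the watchdog for it.
 */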

static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}
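
/* Worked example of the band mapping above: skb->priority is masked with
 * TC_PRIO_MAX (15) and used as an index into prio2band, so e.g.
 * TC_PRIO_BESTEFFORT (0) lands in band 1, TC_PRIO_BULK (2) in band 2 and
 * TC_PRIO_INTERACTIVE (6) in band 0.  Lower-numbered bands are served
 * first by pfifo_fast_dequeue() below.
 */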

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        /* err is already negative (-ENOBUFS); pass it through as-is */
        return ERR_PTR(err);
}
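
/* Resulting layout (illustrative diagram, not code):
 *
 *      p   -------> +----------------------------+  start of kzalloc'd block
 *                   | up to QDISC_ALIGNTO-1 pad  |
 *      sch -------> +----------------------------+  QDISC_ALIGN(p)
 *                   | struct Qdisc               |
 *                   +----------------------------+  qdisc_priv(sch)
 *                   | ops->priv_size bytes       |
 *                   +----------------------------+
 *
 * sch->padded records (sch - p) so that __qdisc_destroy() can kfree() the
 * original allocation.
 */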

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
                                unsigned int parentid)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;
        sch->stats_lock = &dev->queue_lock;
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create a
           default one, i.e. pfifo_fast for devices which need queueing
           and noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                dev->qdisc_sleeping = qdisc;
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;
        struct sk_buff *skb;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        skb = dev->gso_skb;
        dev->gso_skb = NULL;
        spin_unlock_bh(&dev->queue_lock);

        kfree_skb(skb);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();
}
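
/* For orientation (a summary of callers, not code from this file):
 * dev_activate() is reached from dev_open() and again from the link-watch
 * path once the carrier comes up, while dev_deactivate() is reached from
 * dev_close() (and on carrier loss) before the device stops transmitting.
 */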

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);