netns xfrm: fix "ip xfrm state|policy count" misreport
[safe/jmp/linux-2.6] / net / sched / sch_api.c
index 341d558..75fd1c6 100644 (file)
@@ -97,10 +97,9 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 
    Auxiliary routines:
 
-   ---requeue
+   ---peek
 
-   requeues once dequeued packet. It is used for non-standard or
-   just buggy devices, which can defer output even if netif_queue_stopped()=0.
+   like dequeue but without removing a packet from the queue
 
    ---reset
 
@@ -147,8 +146,14 @@ int register_qdisc(struct Qdisc_ops *qops)
 
        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
-       if (qops->requeue == NULL)
-               qops->requeue = noop_qdisc_ops.requeue;
+       if (qops->peek == NULL) {
+               if (qops->dequeue == NULL) {
+                       qops->peek = noop_qdisc_ops.peek;
+               } else {
+                       rc = -EINVAL;
+                       goto out;
+               }
+       }
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;
 
@@ -184,7 +189,7 @@ EXPORT_SYMBOL(unregister_qdisc);
    (root qdisc, all its children, children of children etc.)
  */
 
-struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
+static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 {
        struct Qdisc *q;
 
@@ -199,52 +204,29 @@ struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
        return NULL;
 }
 
-/*
- * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
- * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
- */
-static DEFINE_SPINLOCK(qdisc_list_lock);
-
 static void qdisc_list_add(struct Qdisc *q)
 {
-       if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
-               spin_lock_bh(&qdisc_list_lock);
-               list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
-               spin_unlock_bh(&qdisc_list_lock);
-       }
+       if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
+               list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 }
 
 void qdisc_list_del(struct Qdisc *q)
 {
-       if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
-               spin_lock_bh(&qdisc_list_lock);
+       if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
                list_del(&q->list);
-               spin_unlock_bh(&qdisc_list_lock);
-       }
 }
 EXPORT_SYMBOL(qdisc_list_del);
 
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
-       unsigned int i;
        struct Qdisc *q;
 
-       spin_lock_bh(&qdisc_list_lock);
-
-       for (i = 0; i < dev->num_tx_queues; i++) {
-               struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-               struct Qdisc *txq_root = txq->qdisc_sleeping;
-
-               q = qdisc_match_from_root(txq_root, handle);
-               if (q)
-                       goto unlock;
-       }
+       q = qdisc_match_from_root(dev->qdisc, handle);
+       if (q)
+               goto out;
 
        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
-
-unlock:
-       spin_unlock_bh(&qdisc_list_lock);
-
+out:
        return q;
 }
 
@@ -417,6 +399,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
        struct nlattr *nest;
 
        nest = nla_nest_start(skb, TCA_STAB);
+       if (nest == NULL)
+               goto nla_put_failure;
        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
        nla_nest_end(skb, nest);
 
@@ -454,13 +438,23 @@ out:
 }
 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 
+void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
+{
+       if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
+               printk(KERN_WARNING
+                      "%s: %s qdisc %X: is non-work-conserving?\n",
+                      txt, qdisc->ops->id, qdisc->handle >> 16);
+               qdisc->flags |= TCQ_F_WARN_NONWC;
+       }
+}
+EXPORT_SYMBOL(qdisc_warn_nonwc);
+
 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 {
        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
                                                 timer);
 
        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
-       smp_wmb();
        __netif_schedule(qdisc_root(wd->qdisc));
 
        return HRTIMER_NORESTART;
@@ -484,7 +478,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 
        wd->qdisc->flags |= TCQ_F_THROTTLED;
        time = ktime_set(0, 0);
-       time = ktime_add_ns(time, PSCHED_US2NS(expires));
+       time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 }
 EXPORT_SYMBOL(qdisc_watchdog_schedule);
@@ -616,32 +610,6 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
        return i>0 ? autohandle : 0;
 }
 
-/* Attach toplevel qdisc to device queue. */
-
-static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
-                                    struct Qdisc *qdisc)
-{
-       struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
-       spinlock_t *root_lock;
-
-       root_lock = qdisc_lock(oqdisc);
-       spin_lock_bh(root_lock);
-
-       /* Prune old scheduler */
-       if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
-               qdisc_reset(oqdisc);
-
-       /* ... and graft new one */
-       if (qdisc == NULL)
-               qdisc = &noop_qdisc;
-       dev_queue->qdisc_sleeping = qdisc;
-       dev_queue->qdisc = &noop_qdisc;
-
-       spin_unlock_bh(root_lock);
-
-       return oqdisc;
-}
-
 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 {
        const struct Qdisc_class_ops *cops;
@@ -710,6 +678,11 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);
 
+               if (new && new->ops->attach) {
+                       new->ops->attach(new);
+                       num_q = 0;
+               }
+
                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = &dev->rx_queue;
 
@@ -720,6 +693,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                        if (new && i > 0)
                                atomic_inc(&new->refcnt);
 
+                       if (!ingress)
+                               qdisc_destroy(old);
+               }
+
+               if (!ingress) {
+                       notify_and_destroy(skb, n, classid, dev->qdisc, new);
+                       if (new && !new->ops->attach)
+                               atomic_inc(&new->refcnt);
+                       dev->qdisc = new ? : &noop_qdisc;
+               } else {
                        notify_and_destroy(skb, n, classid, old, new);
                }
 
@@ -728,14 +711,14 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
        } else {
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
-               err = -EINVAL;
-
-               if (cops) {
+               err = -EOPNOTSUPP;
+               if (cops && cops->graft) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                err = cops->graft(parent, cl, new, &old);
                                cops->put(parent, cl);
-                       }
+                       } else
+                               err = -ENOENT;
                }
                if (!err)
                        notify_and_destroy(skb, n, classid, old, new);
@@ -755,7 +738,8 @@ static struct lock_class_key qdisc_rx_lock;
 
 static struct Qdisc *
 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
-            u32 parent, u32 handle, struct nlattr **tca, int *errp)
+            struct Qdisc *p, u32 parent, u32 handle,
+            struct nlattr **tca, int *errp)
 {
        int err;
        struct nlattr *kind = tca[TCA_KIND];
@@ -764,7 +748,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
        struct qdisc_size_table *stab;
 
        ops = qdisc_lookup_ops(kind);
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
@@ -825,24 +809,28 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
                        stab = qdisc_get_stab(tca[TCA_STAB]);
                        if (IS_ERR(stab)) {
                                err = PTR_ERR(stab);
-                               goto err_out3;
+                               goto err_out4;
                        }
                        sch->stab = stab;
                }
                if (tca[TCA_RATE]) {
+                       spinlock_t *root_lock;
+
+                       err = -EOPNOTSUPP;
+                       if (sch->flags & TCQ_F_MQROOT)
+                               goto err_out4;
+
+                       if ((sch->parent != TC_H_ROOT) &&
+                           !(sch->flags & TCQ_F_INGRESS) &&
+                           (!p || !(p->flags & TCQ_F_MQROOT)))
+                               root_lock = qdisc_root_sleeping_lock(sch);
+                       else
+                               root_lock = qdisc_lock(sch);
+
                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
-                                               qdisc_root_lock(sch),
-                                               tca[TCA_RATE]);
-                       if (err) {
-                               /*
-                                * Any broken qdiscs that would require
-                                * a ops->reset() here? The qdisc was never
-                                * in action so it shouldn't be necessary.
-                                */
-                               if (ops->destroy)
-                                       ops->destroy(sch);
-                               goto err_out3;
-                       }
+                                               root_lock, tca[TCA_RATE]);
+                       if (err)
+                               goto err_out4;
                }
 
                qdisc_list_add(sch);
@@ -850,7 +838,6 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
                return sch;
        }
 err_out3:
-       qdisc_put_stab(sch->stab);
        dev_put(dev);
        kfree((char *) sch - sch->padded);
 err_out2:
@@ -858,6 +845,16 @@ err_out2:
 err_out:
        *errp = err;
        return NULL;
+
+err_out4:
+       /*
+        * Any broken qdiscs that would require a ops->reset() here?
+        * The qdisc was never in action so it shouldn't be necessary.
+        */
+       qdisc_put_stab(sch->stab);
+       if (ops->destroy)
+               ops->destroy(sch);
+       goto err_out3;
 }
 
 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
@@ -882,9 +879,16 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
        qdisc_put_stab(sch->stab);
        sch->stab = stab;
 
-       if (tca[TCA_RATE])
+       if (tca[TCA_RATE]) {
+               /* NB: ignores errors from replace_estimator
+                  because change can't be undone. */
+               if (sch->flags & TCQ_F_MQROOT)
+                       goto out;
                gen_replace_estimator(&sch->bstats, &sch->rate_est,
-                                     qdisc_root_lock(sch), tca[TCA_RATE]);
+                                           qdisc_root_sleeping_lock(sch),
+                                           tca[TCA_RATE]);
+       }
+out:
        return 0;
 }
 
@@ -943,7 +947,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
        struct Qdisc *p = NULL;
        int err;
 
-       if (net != &init_net)
+       if (!net_eq(net, &init_net))
                return -EINVAL;
 
        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
@@ -963,9 +967,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                                q = dev->rx_queue.qdisc_sleeping;
                        }
                } else {
-                       struct netdev_queue *dev_queue;
-                       dev_queue = netdev_get_tx_queue(dev, 0);
-                       q = dev_queue->qdisc_sleeping;
+                       q = dev->qdisc;
                }
                if (!q)
                        return -ENOENT;
@@ -1007,7 +1009,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
        struct Qdisc *q, *p;
        int err;
 
-       if (net != &init_net)
+       if (!net_eq(net, &init_net))
                return -EINVAL;
 
 replay:
@@ -1033,9 +1035,7 @@ replay:
                                q = dev->rx_queue.qdisc_sleeping;
                        }
                } else {
-                       struct netdev_queue *dev_queue;
-                       dev_queue = netdev_get_tx_queue(dev, 0);
-                       q = dev_queue->qdisc_sleeping;
+                       q = dev->qdisc;
                }
 
                /* It may be default qdisc, ignore it */
@@ -1112,13 +1112,23 @@ create_n_graft:
        if (!(n->nlmsg_flags&NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS)
-               q = qdisc_create(dev, &dev->rx_queue,
+               q = qdisc_create(dev, &dev->rx_queue, p,
                                 tcm->tcm_parent, tcm->tcm_parent,
                                 tca, &err);
-       else
-               q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
+       else {
+               struct netdev_queue *dev_queue;
+
+               if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
+                       dev_queue = p->ops->cl_ops->select_queue(p, tcm);
+               else if (p)
+                       dev_queue = p->dev_queue;
+               else
+                       dev_queue = netdev_get_tx_queue(dev, 0);
+
+               q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
+       }
        if (q == NULL) {
                if (err == -EAGAIN)
                        goto replay;
@@ -1161,15 +1171,15 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
                goto nla_put_failure;
 
-       if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
-                                        TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
+       if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+                                        qdisc_root_sleeping_lock(q), &d) < 0)
                goto nla_put_failure;
 
        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;
 
        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
-           gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+           gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, &q->qstats) < 0)
                goto nla_put_failure;
 
@@ -1264,14 +1274,15 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
        int s_idx, s_q_idx;
        struct net_device *dev;
 
-       if (net != &init_net)
+       if (!net_eq(net, &init_net))
                return 0;
 
        s_idx = cb->args[0];
        s_q_idx = q_idx = cb->args[1];
-       read_lock(&dev_base_lock);
+
+       rcu_read_lock();
        idx = 0;
-       for_each_netdev(&init_net, dev) {
+       for_each_netdev_rcu(&init_net, dev) {
                struct netdev_queue *dev_queue;
 
                if (idx < s_idx)
@@ -1280,8 +1291,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
                        s_q_idx = 0;
                q_idx = 0;
 
-               dev_queue = netdev_get_tx_queue(dev, 0);
-               if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+               if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
                        goto done;
 
                dev_queue = &dev->rx_queue;
@@ -1293,7 +1303,7 @@ cont:
        }
 
 done:
-       read_unlock(&dev_base_lock);
+       rcu_read_unlock();
 
        cb->args[0] = idx;
        cb->args[1] = q_idx;
@@ -1312,7 +1322,6 @@ done:
 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
        struct net *net = sock_net(skb->sk);
-       struct netdev_queue *dev_queue;
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
@@ -1325,7 +1334,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
        u32 qid = TC_H_MAJ(clid);
        int err;
 
-       if (net != &init_net)
+       if (!net_eq(net, &init_net))
                return -EINVAL;
 
        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
@@ -1350,7 +1359,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
        /* Step 1. Determine qdisc handle X:0 */
 
-       dev_queue = netdev_get_tx_queue(dev, 0);
        if (pid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(pid);
 
@@ -1361,7 +1369,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
-                       qid = dev_queue->qdisc_sleeping->handle;
+                       qid = dev->qdisc->handle;
 
                /* Now qid is genuine qdisc handle consistent
                   both with parent and child.
@@ -1372,7 +1380,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                        pid = TC_H_MAKE(qid, pid);
        } else {
                if (qid == 0)
-                       qid = dev_queue->qdisc_sleeping->handle;
+                       qid = dev->qdisc->handle;
        }
 
        /* OK. Locate qdisc */
@@ -1406,7 +1414,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                                goto out;
                        break;
                case RTM_DELTCLASS:
-                       err = cops->delete(q, cl);
+                       err = -EOPNOTSUPP;
+                       if (cops->delete)
+                               err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
@@ -1420,7 +1430,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
        }
 
        new_cl = cl;
-       err = cops->change(q, clid, pid, tca, &new_cl);
+       err = -EOPNOTSUPP;
+       if (cops->change)
+               err = cops->change(q, clid, pid, tca, &new_cl);
        if (err == 0)
                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
 
@@ -1445,6 +1457,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
+       tcm->tcm__pad1 = 0;
+       tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = q->handle;
        tcm->tcm_handle = q->handle;
@@ -1453,8 +1467,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                goto nla_put_failure;
 
-       if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
-                                        TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
+       if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+                                        qdisc_root_sleeping_lock(q), &d) < 0)
                goto nla_put_failure;
 
        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
@@ -1562,7 +1576,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
        struct net_device *dev;
        int t, s_t;
 
-       if (net != &init_net)
+       if (!net_eq(net, &init_net))
                return 0;
 
        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
@@ -1573,8 +1587,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
        s_t = cb->args[0];
        t = 0;
 
-       dev_queue = netdev_get_tx_queue(dev, 0);
-       if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+       if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
                goto done;
 
        dev_queue = &dev->rx_queue;
@@ -1669,7 +1682,7 @@ static int psched_show(struct seq_file *seq, void *v)
 
        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
-                  (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
+                  (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
 
@@ -1694,6 +1707,7 @@ static int __init pktsched_init(void)
 {
        register_qdisc(&pfifo_qdisc_ops);
        register_qdisc(&bfifo_qdisc_ops);
+       register_qdisc(&mq_qdisc_ops);
        proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
 
        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);