X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fcore%2Fdev.c;h=1845b08c624e1fa4c9dd398f4b61c2963dbfc1f2;hb=b2e75eff5e859d0c294e7405958362b26a423c6e;hp=e8041eb76ac1935adbe35d034fa9d8f9d45a53be;hpb=b0e28f1effd1d840b36e961edc1def81e01b1ca1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/core/dev.c b/net/core/dev.c index e8041eb..1845b08 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -208,17 +208,17 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *queue) +static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&queue->input_pkt_queue.lock); + spin_lock(&sd->input_pkt_queue.lock); #endif } -static inline void rps_unlock(struct softnet_data *queue) +static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&queue->input_pkt_queue.lock); + spin_unlock(&sd->input_pkt_queue.lock); #endif } @@ -264,7 +264,7 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); #ifdef CONFIG_LOCKDEP @@ -954,18 +954,22 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net *net, const char *name, char *buf, - bool fmt) +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) { + struct net *net; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + if (!dev_valid_name(name)) return -EINVAL; if (fmt && strchr(name, '%')) - return __dev_alloc_name(net, name, buf); + return dev_alloc_name(dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; - else if (buf != name) - strlcpy(buf, name, IFNAMSIZ); + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); return 0; } @@ -997,20 +1001,15 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(net, newname, dev->name, 1); + err = dev_get_valid_name(dev, newname, 1); if (err < 0) return err; rollback: - /* For now only devices in the initial network namespace - * are in sysfs. - */ - if (net_eq(net, &init_net)) { - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; - } + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; } write_lock_bh(&dev_base_lock); @@ -1435,6 +1434,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } @@ -1453,7 +1453,7 @@ void net_disable_timestamp(void) } EXPORT_SYMBOL(net_disable_timestamp); -static inline void net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) __net_timestamp(skb); @@ -1461,6 +1461,12 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +static inline void net_timestamp_check(struct sk_buff *skb) +{ + if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) + __net_timestamp(skb); +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1469,7 +1475,7 @@ static inline void net_timestamp(struct sk_buff *skb) * * return values: * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) + * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue @@ -1483,12 +1489,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { skb_orphan(skb); - if (!(dev->flags & IFF_UP)) - return NET_RX_DROP; - - if (skb->len > (dev->mtu + dev->hard_header_len)) + if (!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len))) { + kfree_skb(skb); return NET_RX_DROP; - + } skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; @@ -1508,9 +1513,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_NET_CLS_ACT if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp(skb); + net_timestamp_set(skb); #else - net_timestamp(skb); + net_timestamp_set(skb); #endif rcu_read_lock(); @@ -1556,8 +1561,9 @@ static inline void __netif_reschedule(struct Qdisc *q) local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - q->next_sched = sd->output_queue; - sd->output_queue = q; + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1880,6 +1886,17 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +/* + * Try to orphan skb early, right before transmission by the device. + * We cannot orphan skb if tx timestamp is requested, since + * drivers need to call skb_tstamp_tx() to send the timestamp. + */ +static inline void skb_orphan_try(struct sk_buff *skb) +{ + if (!skb_tx(skb)->flags) + skb_orphan(skb); +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -1890,13 +1907,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } - /* * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache @@ -1904,23 +1914,18 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + skb_orphan_try(skb); + + if (netif_needs_gso(dev, skb)) { + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } + rc = ops->ndo_start_xmit(skb, dev); if (rc == NETDEV_TX_OK) txq_trans_update(txq); - /* - * TODO: if skb_orphan() was called by - * dev->hard_start_xmit() (for example, the unmodified - * igb driver does that; bnx2 doesn't), then - * skb_tx_software_timestamp() will be unable to send - * back the time stamp. - * - * How can this be prevented? Always create another - * reference to the socket before calling - * dev->hard_start_xmit()? Prevent that skb_orphan() - * does anything in dev->hard_start_xmit() by clearing - * the skb destructor before the call and restoring it - * afterwards, then doing the skb_orphan() ourselves? - */ return rc; } @@ -1975,7 +1980,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = skb->protocol; + hash = (__force u16) skb->protocol; hash = jhash_1word(hash, hashrnd); @@ -2015,8 +2020,12 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues > 1) queue_index = skb_tx_hash(dev, skb); - if (sk && rcu_dereference_check(sk->sk_dst_cache, 1)) - sk_tx_queue_set(sk, queue_index); + if (sk) { + struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + } } } @@ -2042,6 +2051,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); __qdisc_update_bstats(q, skb->len); if (sch_direct_xmit(skb, q, dev, txq, root_lock)) __qdisc_run(q); @@ -2050,6 +2061,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { + skb_dst_force(skb); rc = qdisc_enqueue_root(skb, q); qdisc_run(q); } @@ -2197,26 +2209,46 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} #ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) { struct ipv6hdr *ip6; struct iphdr *ip; struct netdev_rx_queue *rxqueue; struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u8 ip_proto; - u32 addr1, addr2, ports, ihl; + u16 tcpu; + u32 addr1, addr2, ihl; + union { + u32 v32; + u16 v16[2]; + } ports; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); @@ -2232,7 +2264,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) } else rxqueue = dev->_rx; - if (!rxqueue->rps_map) + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) goto done; if (skb->rxhash) @@ -2245,8 +2277,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) ip = (struct iphdr *) skb->data; ip_proto = ip->protocol; - addr1 = ip->saddr; - addr2 = ip->daddr; + addr1 = (__force u32) ip->saddr; + addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): @@ -2255,14 +2287,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) ip6 = (struct ipv6hdr *) skb->data; ip_proto = ip6->nexthdr; - addr1 = ip6->saddr.s6_addr32[3]; - addr2 = ip6->daddr.s6_addr32[3]; + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; ihl = (40 >> 2); break; default: goto done; } - ports = 0; switch (ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -2271,22 +2302,67 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb) case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) - ports = *((u32 *) (skb->data + (ihl * 4))); - break; - + if (pskb_may_pull(skb, (ihl * 4) + 4)) { + ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + break; + } default: + ports.v32 = 0; break; } - skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd); + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); + skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); if (!skb->rxhash) skb->rxhash = 1; got_hash: + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[skb->rxhash & + sock_flow_table->mask]; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) { + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) + rflow->last_qtail = per_cpu(softnet_data, + tcpu).input_queue_head; + } + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + map = rcu_dereference(rxqueue->rps_map); if (map) { - u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; if (cpu_online(tcpu)) { cpu = tcpu; @@ -2298,75 +2374,76 @@ done: return cpu; } +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ + struct softnet_data *sd = data; + + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + /* - * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled - * to be sent to kick remote softirq processing. There are two masks since - * the sending of IPIs must be done with interrupts enabled. The select field - * indicates the current mask that enqueue_backlog uses to schedule IPIs. - * select is flipped before net_rps_action is called while still under lock, - * net_rps_action then uses the non-selected mask to send the IPIs and clears - * it without conflicting with enqueue_backlog operation. + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 */ -struct rps_remote_softirq_cpus { - cpumask_t mask[2]; - int select; -}; -static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus); - -/* Called from hardirq (IPI) context */ -static void trigger_softirq(void *data) +static int rps_ipi_queued(struct softnet_data *sd) { - struct softnet_data *queue = data; - __napi_schedule(&queue->backlog); - __get_cpu_var(netdev_rx_stat).received_rps++; +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; } -#endif /* CONFIG_SMP */ /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ -static int enqueue_to_backlog(struct sk_buff *skb, int cpu) +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) { - struct softnet_data *queue; + struct softnet_data *sd; unsigned long flags; - queue = &per_cpu(softnet_data, cpu); + sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); - __get_cpu_var(netdev_rx_stat).total++; - rps_lock(queue); - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { + rps_lock(sd); + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: - __skb_queue_tail(&queue->input_pkt_queue, skb); - rps_unlock(queue); + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock(sd); local_irq_restore(flags); return NET_RX_SUCCESS; } - /* Schedule NAPI for backlog device */ - if (napi_schedule_prep(&queue->backlog)) { -#ifdef CONFIG_RPS - if (cpu != smp_processor_id()) { - struct rps_remote_softirq_cpus *rcpus = - &__get_cpu_var(rps_remote_softirq_cpus); - - cpu_set(cpu, rcpus->mask[rcpus->select]); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - } else - __napi_schedule(&queue->backlog); -#else - __napi_schedule(&queue->backlog); -#endif + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); } goto enqueue; } - rps_unlock(queue); + sd->dropped++; + rps_unlock(sd); - __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); kfree_skb(skb); @@ -2396,23 +2473,30 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); #ifdef CONFIG_RPS { + struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); - cpu = get_rps_cpu(skb->dev, skb); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); - ret = enqueue_to_backlog(skb, cpu); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); } #else - ret = enqueue_to_backlog(skb, get_cpu()); - put_cpu(); + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } #endif return ret; } @@ -2459,6 +2543,7 @@ static void net_tx_action(struct softirq_action *h) local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { @@ -2535,7 +2620,8 @@ static inline struct sk_buff *handle_bridge(struct sk_buff *skb, #endif #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; +struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p, + struct sk_buff *skb) __read_mostly; EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, @@ -2543,14 +2629,17 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, int *ret, struct net_device *orig_dev) { - if (skb->dev->macvlan_port == NULL) + struct macvlan_port *port; + + port = rcu_dereference(skb->dev->macvlan_port); + if (!port) return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return macvlan_handle_frame_hook(skb); + return macvlan_handle_frame_hook(port, skb); } #else #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) @@ -2710,8 +2799,8 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (!netdev_tstamp_prequeue) + net_timestamp_check(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; @@ -2733,7 +2822,7 @@ static int __netif_receive_skb(struct sk_buff *skb) skb->dev = master; } - __get_cpu_var(netdev_rx_stat).total++; + __get_cpu_var(softnet_data).processed++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -2829,35 +2918,60 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + #ifdef CONFIG_RPS - int cpu; + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; + + rcu_read_lock(); - cpu = get_rps_cpu(skb->dev, skb); + cpu = get_rps_cpu(skb->dev, skb, &rflow); - if (cpu < 0) - return __netif_receive_skb(skb); - else - return enqueue_to_backlog(skb, cpu); + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } + + return ret; + } #else return __netif_receive_skb(skb); #endif } EXPORT_SYMBOL(netif_receive_skb); -/* Network device is going away, flush any packets still pending */ +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. + */ static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; - rps_lock(queue); - skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->input_pkt_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { - __skb_unlink(skb, &queue->input_pkt_queue); + __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); + input_queue_head_incr(sd); } - rps_unlock(queue); + } } static int napi_gro_complete(struct sk_buff *skb) @@ -3160,30 +3274,87 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; + } + } else +#endif + local_irq_enable(); +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); +#ifdef CONFIG_RPS + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif napi->weight = weight_p; - do { + local_irq_disable(); + while (work < quota) { struct sk_buff *skb; + unsigned int qlen; - local_irq_disable(); - rps_lock(queue); - skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) { - __napi_complete(napi); - rps_unlock(queue); + while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); - break; + __netif_receive_skb(skb); + local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } } - rps_unlock(queue); - local_irq_enable(); - __netif_receive_skb(skb); - } while (++work < quota && jiffies == start_time); + rps_lock(sd); + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + + if (qlen < quota - work) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set on backlog. + * we can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + list_del(&napi->poll_list); + napi->state = 0; + + quota = work + qlen; + } + rps_unlock(sd); + } + local_irq_enable(); return work; } @@ -3199,8 +3370,7 @@ void __napi_schedule(struct napi_struct *n) unsigned long flags; local_irq_save(flags); - list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -3271,39 +3441,16 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); -#ifdef CONFIG_RPS -/* - * net_rps_action sends any pending IPI's for rps. This is only called from - * softirq and interrupts must be enabled. - */ -static void net_rps_action(cpumask_t *mask) -{ - int cpu; - - /* Send pending IPI's to kick RPS processing on remote cpus. */ - for_each_cpu_mask_nr(cpu, *mask) { - struct softnet_data *queue = &per_cpu(softnet_data, cpu); - if (cpu_online(cpu)) - __smp_call_function_single(cpu, &queue->csd, 0); - } - cpus_clear(*mask); -} -#endif - static void net_rx_action(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(softnet_data).poll_list; + struct softnet_data *sd = &__get_cpu_var(softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; -#ifdef CONFIG_RPS - int select; - struct rps_remote_softirq_cpus *rcpus; -#endif local_irq_disable(); - while (!list_empty(list)) { + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -3321,7 +3468,7 @@ static void net_rx_action(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - n = list_first_entry(list, struct napi_struct, poll_list); + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); have = netpoll_poll_lock(n); @@ -3356,23 +3503,13 @@ static void net_rx_action(struct softirq_action *h) napi_complete(n); local_irq_disable(); } else - list_move_tail(&n->poll_list, list); + list_move_tail(&n->poll_list, &sd->poll_list); } netpoll_poll_unlock(have); } out: -#ifdef CONFIG_RPS - rcpus = &__get_cpu_var(rps_remote_softirq_cpus); - select = rcpus->select; - rcpus->select ^= 1; - - local_irq_enable(); - - net_rps_action(&rcpus->mask[select]); -#else - local_irq_enable(); -#endif + net_rps_action_and_irq_enable(sd); #ifdef CONFIG_NET_DMA /* @@ -3385,7 +3522,7 @@ out: return; softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; + sd->time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -3586,17 +3723,17 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } -static struct netif_rx_stats *softnet_get_online(loff_t *pos) +static struct softnet_data *softnet_get_online(loff_t *pos) { - struct netif_rx_stats *rc = NULL; + struct softnet_data *sd = NULL; while (*pos < nr_cpu_ids) if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); + sd = &per_cpu(softnet_data, *pos); break; } else ++*pos; - return rc; + return sd; } static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) @@ -3616,12 +3753,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { - struct netif_rx_stats *s = v; + struct softnet_data *sd = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, 0, + sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision, s->received_rps); + sd->cpu_collision, sd->received_rps); return 0; } @@ -4827,7 +4964,7 @@ int register_netdevice(struct net_device *dev) } } - ret = dev_get_valid_name(net, dev->name, dev->name, 0); + ret = dev_get_valid_name(dev, dev->name, 0); if (ret) goto err_uninit; @@ -4856,8 +4993,6 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - netdev_initialize_kobject(dev); - ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5409,15 +5544,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char if (dev->features & NETIF_F_NETNS_LOCAL) goto out; -#ifdef CONFIG_SYSFS - /* Don't allow real devices to be moved when sysfs - * is enabled. - */ - err = -EINVAL; - if (dev->dev.parent) - goto out; -#endif - /* Ensure the device has been registrered */ err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) @@ -5436,7 +5562,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(net, pat, dev->name, 1)) + if (dev_get_valid_name(dev, pat, 1)) goto out; } @@ -5468,8 +5594,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_uc_flush(dev); dev_mc_flush(dev); - netdev_unregister_kobject(dev); - /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -5482,7 +5606,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char } /* Fixup kobjects */ - err = netdev_register_kobject(dev); + err = device_rename(&dev->dev, dev->name); WARN_ON(err); /* Add the device back in the hashes */ @@ -5509,7 +5633,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct Qdisc **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; @@ -5530,20 +5653,26 @@ static int dev_cpu_callback(struct notifier_block *nfb, *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); + input_queue_head_incr(oldsd); + } + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } return NOTIFY_OK; } @@ -5759,23 +5888,26 @@ static int __init net_dev_init(void) */ for_each_possible_cpu(i) { - struct softnet_data *queue; - - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); + struct softnet_data *sd = &per_cpu(softnet_data, i); + memset(sd, 0, sizeof(*sd)); + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + sd->completion_queue = NULL; + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - queue->csd.func = trigger_softirq; - queue->csd.info = queue; - queue->csd.flags = 0; + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->csd.flags = 0; + sd->cpu = i; #endif - queue->backlog.poll = process_backlog; - queue->backlog.weight = weight_p; - queue->backlog.gro_list = NULL; - queue->backlog.gro_count = 0; + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + sd->backlog.gro_list = NULL; + sd->backlog.gro_count = 0; } dev_boot_phase = 0;