X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Fip_output.c;h=3d6167fb2d970b1a46db9d1f5be823610fdb0c7d;hb=adf30907d63893e4208dfe3f5c88ae12bc2f25d5;hp=669f5d97c6eb487f771c518e4c7c21c43e98d07b;hpb=bbe735e4247dba32568a305553b010081c8dea99;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 669f5d9..3d6167f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, @@ -75,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -92,6 +89,28 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +int __ip_local_out(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); +} + +int ip_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out); + /* dev_loopback_xmit for use with netfilter. */ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { @@ -99,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); + WARN_ON(!skb_dst(newskb)); netif_rx(newskb); return 0; } @@ -121,13 +140,13 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); skb_reset_network_header(skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; @@ -139,32 +158,36 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, daddr, rt, 0); } - ip_send_check(iph); skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; /* Send it out. */ - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; - int hh_len = LL_RESERVED_SPACE(dev); + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ - if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); @@ -189,16 +212,24 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static inline int ip_finish_output(struct sk_buff *skb) +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +} + +static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) { + if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) + if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -207,13 +238,13 @@ static inline int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->u.dst.dev; /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -238,14 +269,14 @@ int ip_mc_output(struct sk_buff *skb) ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ - if (skb->nh.iph->ttl == 0) { + if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } @@ -254,25 +285,25 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; + struct net_device *dev = skb_dst(skb)->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -288,7 +319,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = (struct rtable *) skb->dst; + rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -309,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) .saddr = inet->saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->sport, .dport = inet->dport } } }; @@ -318,12 +350,12 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) + if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); } - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -332,9 +364,8 @@ packet_routed: /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); skb_reset_network_header(skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else @@ -353,16 +384,13 @@ packet_routed: ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - /* Add an IP checksum. */ - ip_send_check(iph); - skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -373,8 +401,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -384,21 +412,14 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif -#ifdef CONFIG_NETFILTER - /* Connection association is same as pre-frag packet */ - nf_conntrack_put(to->nfct); - to->nfct = from->nfct; - nf_conntrack_get(to->nfct); - to->nfctinfo = from->nfctinfo; + nf_copy(to, from); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + to->nf_trace = from->nf_trace; +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(to->nf_bridge); - to->nf_bridge = from->nf_bridge; - nf_bridge_get(to->nf_bridge); -#endif -#endif skb_copy_secmark(to, from); } @@ -409,7 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; int raw = 0; @@ -419,7 +440,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); int err = 0; dev = rt->u.dst.dev; @@ -428,12 +449,12 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * Point into the IP datagram header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); return -EMSGSIZE; } @@ -456,6 +477,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) if (skb_shinfo(skb)->frag_list) { struct sk_buff *frag; int first_len = skb_pagelen(skb); + int truesizes = 0; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -479,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; - skb->truesize -= frag->truesize; + truesizes += frag->truesize; } } @@ -490,6 +512,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) frag = skb_shinfo(skb)->frag_list; skb_shinfo(skb)->frag_list = NULL; skb->data_len = first_len - skb_headlen(skb); + skb->truesize -= truesizes; skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); @@ -500,11 +523,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; - frag->h.raw = frag->data; + skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); - memcpy(frag->nh.raw, iph, hlen); - iph = frag->nh.iph; + memcpy(skb_network_header(frag), iph, hlen); + iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) @@ -520,7 +543,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -530,7 +553,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -539,7 +562,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -593,7 +616,7 @@ slow_path: skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); skb_reset_network_header(skb2); - skb2->h.raw = skb2->data + hlen; + skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner @@ -607,19 +630,19 @@ slow_path: * Copy the packet header into the new buffer. */ - memcpy(skb2->nh.raw, skb->data, hlen); + skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ - iph = skb2->nh.iph; + iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. Upgrade options only if @@ -651,15 +674,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -697,7 +720,7 @@ static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int mtu, unsigned int flags) { struct sk_buff *skb; int err; @@ -718,34 +741,26 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ skb_reset_network_header(skb); /* initialize protocol header pointer */ - skb->h.raw = skb->data + fragheaderlen; + skb->transport_header = skb->network_header + fragheaderlen; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - /* specify the length of each IP datagram fragment*/ + /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; } - /* There is not enough support do UFO , - * so follow normal path - */ - kfree_skb(skb); - return err; + + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } /* @@ -763,7 +778,7 @@ int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable *rt, + struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -778,6 +793,7 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; + struct rtable *rt; if (flags&MSG_PROBE) return 0; @@ -797,9 +813,15 @@ int ip_append_data(struct sock *sk, inet->cork.flags |= IPCORK_OPT; inet->cork.addr = ipc->addr; } - dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); - inet->cork.rt = rt; + rt = *rtp; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->u.dst.dev->mtu : + dst_mtu(rt->u.dst.path); + inet->cork.dst = &rt->u.dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; @@ -808,7 +830,7 @@ int ip_append_data(struct sock *sk, transhdrlen += exthdrlen; } } else { - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -832,14 +854,14 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features & NETIF_F_ALL_CSUM && + rt->u.dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) csummode = CHECKSUM_PARTIAL; inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - + if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -913,6 +935,10 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else + /* only the initial fragment is + time stamped */ + ipc->shtx.flags = 0; } if (skb == NULL) goto error; @@ -923,14 +949,16 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + *skb_tx(skb) = ipc->shtx; /* * Find where to start putting bytes. */ data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; + skb_set_network_header(skb, exthdrlen); + skb->transport_header = (skb->network_header + + fragheaderlen); data += fragheaderlen; - skb->h.raw = data + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -1007,8 +1035,6 @@ alloc_new_skb: skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); } else { err = -EMSGSIZE; goto error; @@ -1021,6 +1047,8 @@ alloc_new_skb: frag->size += copy; skb->len += copy; skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; @@ -1030,7 +1058,7 @@ alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1056,7 +1084,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -1099,8 +1127,6 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (len <= 0) { struct sk_buff *skb_prev; - char *data; - struct iphdr *iph; int alloclen; skb_prev = skb; @@ -1123,16 +1149,15 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, /* * Find where to start putting bytes. */ - data = skb_put(skb, fragheaderlen + fraggap); + skb_put(skb, fragheaderlen + fraggap); skb_reset_network_header(skb); - iph = skb->nh.iph; - data += fragheaderlen; - skb->h.raw = data; - + skb->transport_header = (skb->network_header + + fragheaderlen); if (fraggap) { - skb->csum = skb_copy_and_csum_bits( - skb_prev, maxfraglen, - data, fraggap, 0); + skb->csum = skb_copy_and_csum_bits(skb_prev, + maxfraglen, + skb_transport_header(skb), + fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); pskb_trim_unique(skb_prev, maxfraglen); @@ -1166,6 +1191,8 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, skb->len += len; skb->data_len += len; + skb->truesize += len; + atomic_add(len, &sk->sk_wmem_alloc); offset += len; size -= len; } @@ -1173,10 +1200,19 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } +static void ip_cork_release(struct inet_sock *inet) +{ + inet->cork.flags &= ~IPCORK_OPT; + kfree(inet->cork.opt); + inet->cork.opt = NULL; + dst_release(inet->cork.dst); + inet->cork.dst = NULL; +} + /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. @@ -1186,8 +1222,9 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = inet->cork.rt; + struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; @@ -1198,10 +1235,10 @@ int ip_push_pending_frames(struct sock *sk) tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ - if (skb->data < skb->nh.raw) + if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; @@ -1216,13 +1253,13 @@ int ip_push_pending_frames(struct sock *sk) * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc != IP_PMTUDISC_DO) + if (inet->pmtudisc < IP_PMTUDISC_DO) skb->local_df = 1; /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc == IP_PMTUDISC_DO || + if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); @@ -1243,21 +1280,28 @@ int ip_push_pending_frames(struct sock *sk) ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; - iph->tot_len = htons(skb->len); iph->frag_off = df; ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - ip_send_check(iph); skb->priority = sk->sk_priority; - skb->dst = dst_clone(&rt->u.dst); + skb->mark = sk->sk_mark; + /* + * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec + * on dst refcount + */ + inet->cork.dst = NULL; + skb_dst_set(skb, &rt->u.dst); + + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + err = ip_local_out(skb); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -1266,17 +1310,11 @@ int ip_push_pending_frames(struct sock *sk) } out: - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet); return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } @@ -1285,19 +1323,12 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet_sk(sk)); } @@ -1320,8 +1351,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * * Should run single threaded per socket because it uses the sock * structure to pass arguments. - * - * LATER: switch from ip_build_xmit to ip_append_* */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) @@ -1333,13 +1362,14 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); if (ip_options_echo(&replyopts.opt, skb)) return; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; + ipc.shtx.flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; @@ -1349,17 +1379,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } { - struct flowi fl = { .nl_u = { .ip4_u = + struct flowi fl = { .oif = arg->bound_dev_if, + .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, /* Not quite clean, but right. */ .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } }, - .proto = sk->sk_protocol }; + { .sport = tcp_hdr(skb)->dest, + .dport = tcp_hdr(skb)->source } }, + .proto = sk->sk_protocol, + .flags = ip_reply_arg_flowi_flags(arg) }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; } @@ -1370,14 +1402,17 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar with locally disabled BH and that sk cannot be already spinlocked. */ bh_lock_sock(sk); - inet->tos = skb->nh.iph->tos; + inet->tos = ip_hdr(skb)->tos; sk->sk_priority = skb->priority; - sk->sk_protocol = skb->nh.iph->protocol; + sk->sk_protocol = ip_hdr(skb)->protocol; + sk->sk_bound_dev_if = arg->bound_dev_if; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, rt, MSG_DONTWAIT); + &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) - *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + *((__sum16 *)skb_transport_header(skb) + + arg->csumoffset) = csum_fold(csum_add(skb->csum, + arg->csum)); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk); }