X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Fip_output.c;h=8ebe86dd72af159e2250bc315251cf61561cc38a;hb=76a67ec6fb79ff3570dcb5342142c16098299911;hp=699f06781fd891df9882d1c77c0c335f8d7a7252;hpb=3b04ddde02cf1b6f14f2697da5c20eca5715017f;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 699f067..8ebe86d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, @@ -91,6 +89,28 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +int __ip_local_out(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + dst_output); +} + +int ip_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out); + /* dev_loopback_xmit for use with netfilter. */ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { @@ -98,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); + WARN_ON(!newskb->dst); netif_rx(newskb); return 0; } @@ -120,7 +140,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; struct iphdr *iph; /* Build the IP header. */ @@ -138,20 +158,18 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, daddr, rt, 0); } - ip_send_check(iph); skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; /* Send it out. */ - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); @@ -161,12 +179,12 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct dst_entry *dst = skb->dst; struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; - int hh_len = LL_RESERVED_SPACE(dev); + unsigned int hh_len = LL_RESERVED_SPACE(dev); if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -202,7 +220,7 @@ static inline int ip_skb_dst_mtu(struct sk_buff *skb) skb->dst->dev->mtu : dst_mtu(skb->dst); } -static inline int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -220,13 +238,13 @@ static inline int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; struct net_device *dev = rt->u.dst.dev; /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -251,8 +269,8 @@ int ip_mc_output(struct sk_buff *skb) ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); } @@ -267,11 +285,11 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -280,12 +298,12 @@ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -301,7 +319,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = (struct rtable *) skb->dst; + rt = skb->rtable; if (rt != NULL) goto packet_routed; @@ -322,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) .saddr = inet->saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->sport, .dport = inet->dport } } }; @@ -331,7 +350,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) + if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); @@ -347,7 +366,6 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else @@ -366,16 +384,13 @@ packet_routed: ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - /* Add an IP checksum. */ - ip_send_check(iph); - skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -415,7 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; int raw = 0; @@ -425,7 +440,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; int err = 0; dev = rt->u.dst.dev; @@ -437,7 +452,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); @@ -462,6 +477,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) if (skb_shinfo(skb)->frag_list) { struct sk_buff *frag; int first_len = skb_pagelen(skb); + int truesizes = 0; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -485,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; - skb->truesize -= frag->truesize; + truesizes += frag->truesize; } } @@ -496,6 +512,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) frag = skb_shinfo(skb)->frag_list; skb_shinfo(skb)->frag_list = NULL; skb->data_len = first_len - skb_headlen(skb); + skb->truesize -= truesizes; skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); @@ -526,7 +543,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -536,7 +553,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -545,7 +562,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -657,15 +674,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -703,7 +720,7 @@ static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int mtu, unsigned int flags) { struct sk_buff *skb; int err; @@ -724,7 +741,7 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ skb_reset_network_header(skb); @@ -735,23 +752,15 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - /* specify the length of each IP datagram fragment*/ + /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; } - /* There is not enough support do UFO , - * so follow normal path - */ - kfree_skb(skb); - return err; + + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } /* @@ -769,7 +778,7 @@ int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable *rt, + struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -784,6 +793,7 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; + struct rtable *rt; if (flags&MSG_PROBE) return 0; @@ -803,11 +813,15 @@ int ip_append_data(struct sock *sk, inet->cork.flags |= IPCORK_OPT; inet->cork.addr = ipc->addr; } - dst_hold(&rt->u.dst); + rt = *rtp; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); - inet->cork.rt = rt; + inet->cork.dst = &rt->u.dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; @@ -816,7 +830,7 @@ int ip_append_data(struct sock *sk, transhdrlen += exthdrlen; } } else { - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -845,9 +859,9 @@ int ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - + if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -1016,8 +1030,6 @@ alloc_new_skb: skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); } else { err = -EMSGSIZE; goto error; @@ -1030,6 +1042,8 @@ alloc_new_skb: frag->size += copy; skb->len += copy; skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; @@ -1039,7 +1053,7 @@ alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1065,7 +1079,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -1172,6 +1186,8 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, skb->len += len; skb->data_len += len; + skb->truesize += len; + atomic_add(len, &sk->sk_wmem_alloc); offset += len; size -= len; } @@ -1179,10 +1195,19 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } +static void ip_cork_release(struct inet_sock *inet) +{ + inet->cork.flags &= ~IPCORK_OPT; + kfree(inet->cork.opt); + inet->cork.opt = NULL; + dst_release(inet->cork.dst); + inet->cork.dst = NULL; +} + /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. @@ -1192,8 +1217,9 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = inet->cork.rt; + struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; @@ -1249,25 +1275,28 @@ int ip_push_pending_frames(struct sock *sk) ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; - iph->tot_len = htons(skb->len); iph->frag_off = df; ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - ip_send_check(iph); skb->priority = sk->sk_priority; - skb->dst = dst_clone(&rt->u.dst); + skb->mark = sk->sk_mark; + /* + * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec + * on dst refcount + */ + inet->cork.dst = NULL; + skb->dst = &rt->u.dst; if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + err = ip_local_out(skb); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -1276,17 +1305,11 @@ int ip_push_pending_frames(struct sock *sk) } out: - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet); return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } @@ -1295,19 +1318,12 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet_sk(sk)); } @@ -1330,8 +1346,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * * Should run single threaded per socket because it uses the sock * structure to pass arguments. - * - * LATER: switch from ip_build_xmit to ip_append_* */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) @@ -1343,7 +1357,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; if (ip_options_echo(&replyopts.opt, skb)) return; @@ -1368,9 +1382,10 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .uli_u = { .ports = { .sport = tcp_hdr(skb)->dest, .dport = tcp_hdr(skb)->source } }, - .proto = sk->sk_protocol }; + .proto = sk->sk_protocol, + .flags = ip_reply_arg_flowi_flags(arg) }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; } @@ -1386,7 +1401,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, rt, MSG_DONTWAIT); + &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(skb) +