X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Fip_output.c;h=3451799e3dbf78253d70995d37697a1b546f143d;hb=1b644b6e6f6160ae35ce4b52c2ca89ed3e356e18;hp=6d78e1d6b785ea4c032fdb81bc7e02f014869239;hpb=f1b050bf7a88910f9f00c9c8989c1bf5a67dd140;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6d78e1d..3451799 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, @@ -97,7 +95,7 @@ int __ip_local_out(struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); } @@ -120,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); + WARN_ON(!skb_dst(newskb)); netif_rx(newskb); return 0; } @@ -142,7 +140,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ @@ -168,6 +166,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, } skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; /* Send it out. */ return ip_local_out(skb); @@ -177,15 +176,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); - if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); - else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -218,14 +217,14 @@ static inline int ip_skb_dst_mtu(struct sk_buff *skb) struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb->dst->dev->mtu : dst_mtu(skb->dst); + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); } static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) { + if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } @@ -239,13 +238,13 @@ static int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->u.dst.dev; /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -255,7 +254,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST) { - if ((!sk || inet_sk(sk)->mc_loop) + if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped @@ -265,9 +264,11 @@ int ip_mc_output(struct sk_buff *skb) This check is duplicated in ip_mr_input at the moment. */ - && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif - ) { + ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, @@ -297,9 +298,9 @@ int ip_mc_output(struct sk_buff *skb) int ip_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; + struct net_device *dev = skb_dst(skb)->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -320,7 +321,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = (struct rtable *) skb->dst; + rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -330,32 +331,34 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) __be32 daddr; /* Use correct destination address if we have options. */ - daddr = inet->daddr; + daddr = inet->inet_daddr; if(opt && opt->srr) daddr = opt->faddr; { struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = inet->saddr, + .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; + { .sport = inet->inet_sport, + .dport = inet->inet_dport } } }; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) + if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); } - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -378,18 +381,19 @@ packet_routed: if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->daddr, rt, 0); + ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -400,8 +404,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -429,7 +433,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; int raw = 0; @@ -439,7 +443,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); int err = 0; dev = rt->u.dst.dev; @@ -451,7 +455,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); @@ -473,9 +477,10 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_shinfo(skb)->frag_list) { + if (skb_has_frags(skb)) { struct sk_buff *frag; int first_len = skb_pagelen(skb); + int truesizes = 0; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -483,7 +488,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) skb_cloned(skb)) goto slow_path; - for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || @@ -496,11 +501,10 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) BUG_ON(frag->sk); if (skb->sk) { - sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; - skb->truesize -= frag->truesize; } + truesizes += frag->truesize; } /* Everything is OK. Generate! */ @@ -508,8 +512,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; - skb_shinfo(skb)->frag_list = NULL; + skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); + skb->truesize -= truesizes; skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); @@ -540,7 +545,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -550,7 +555,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -559,7 +564,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -671,15 +676,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -717,7 +722,7 @@ static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int mtu, unsigned int flags) { struct sk_buff *skb; int err; @@ -738,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ skb_reset_network_header(skb); @@ -749,23 +754,15 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - /* specify the length of each IP datagram fragment*/ + /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; } - /* There is not enough support do UFO , - * so follow normal path - */ - kfree_skb(skb); - return err; + + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } /* @@ -783,7 +780,7 @@ int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable *rt, + struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -798,6 +795,7 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; + struct rtable *rt; if (flags&MSG_PROBE) return 0; @@ -817,11 +815,17 @@ int ip_append_data(struct sock *sk, inet->cork.flags |= IPCORK_OPT; inet->cork.addr = ipc->addr; } - dst_hold(&rt->u.dst); + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); - inet->cork.rt = rt; + inet->cork.dst = &rt->u.dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; @@ -830,7 +834,7 @@ int ip_append_data(struct sock *sk, transhdrlen += exthdrlen; } } else { - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -844,7 +848,8 @@ int ip_append_data(struct sock *sk, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + mtu-exthdrlen); return -EMSGSIZE; } @@ -859,9 +864,9 @@ int ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - + if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -935,6 +940,10 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else + /* only the initial fragment is + time stamped */ + ipc->shtx.flags = 0; } if (skb == NULL) goto error; @@ -945,6 +954,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + *skb_tx(skb) = ipc->shtx; /* * Find where to start putting bytes. @@ -1053,7 +1063,7 @@ alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1079,7 +1089,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -1093,7 +1103,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); return -EMSGSIZE; } @@ -1195,7 +1205,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1204,10 +1214,8 @@ static void ip_cork_release(struct inet_sock *inet) inet->cork.flags &= ~IPCORK_OPT; kfree(inet->cork.opt); inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + dst_release(inet->cork.dst); + inet->cork.dst = NULL; } /* @@ -1219,8 +1227,9 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = inet->cork.rt; + struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; @@ -1240,7 +1249,6 @@ int ip_push_pending_frames(struct sock *sk) skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; - __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } @@ -1284,17 +1292,23 @@ int ip_push_pending_frames(struct sock *sk) iph->daddr = rt->rt_dst; skb->priority = sk->sk_priority; - skb->dst = dst_clone(&rt->u.dst); + skb->mark = sk->sk_mark; + /* + * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec + * on dst refcount + */ + inet->cork.dst = NULL; + skb_dst_set(skb, &rt->u.dst); if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ err = ip_local_out(skb); if (err) { if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; } @@ -1304,7 +1318,7 @@ out: return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } @@ -1352,13 +1366,14 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); if (ip_options_echo(&replyopts.opt, skb)) return; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; + ipc.shtx.flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; @@ -1377,9 +1392,10 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .uli_u = { .ports = { .sport = tcp_hdr(skb)->dest, .dport = tcp_hdr(skb)->source } }, - .proto = sk->sk_protocol }; + .proto = sk->sk_protocol, + .flags = ip_reply_arg_flowi_flags(arg) }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; } @@ -1395,7 +1411,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, rt, MSG_DONTWAIT); + &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(skb) +