[SK_BUFF]: Introduce skb_copy_from_linear_data{_offset}
[safe/jmp/linux-2.6] net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
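
/* Illustrative sketch (not a call site in this file): the checksum covers
 * only the header words, so it must be recomputed whenever any header field
 * changes before transmit, e.g.:
 *
 *      iph->tot_len = htons(skb->len);
 *      ip_send_check(iph);
 */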

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}
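
/* Hedged note: uc_ttl is -1 unless the application overrode it with
 * something like
 *
 *      int ttl = 5;
 *      setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 *
 * so most sockets fall through to the per-route RTAX_HOPLIMIT metric, which
 * the ipv4 routing code of this era seeds from sysctl_ip_default_ttl
 * (an assumption about the route code, not verified in this file).
 */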

/*
 *              Add an IP header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}
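
/* Usage sketch (hedged): callers supply an already-routed skb with headroom
 * reserved; in this era of the tree tcp_v4_send_synack() is one such caller:
 *
 *      err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 *                                  ireq->rmt_addr, ireq->opt);
 */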

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb->dst->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}
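
/* Note: GSO skbs larger than the MTU deliberately bypass ip_fragment()
 * above; they are segmented into MTU-sized packets later, at the device
 * layer, after netfilter has seen the single large skb. */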

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   returned after forwarding; ip_mr_input will drop them
                   in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until a route appears
                         * or the connection times out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
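        /* One 16-bit store fills version, ihl and tos at once; htons()
         * keeps it endian-safe.  Worked example: tos == 0x10 gives 0x4510,
         * stored as bytes 0x45 0x10 - version 4, ihl 5 (20-byte header),
         * tos 0x10. */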
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer has set skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each one the IP header plus a block of the
 *      original data) so that each piece fits into a single device frame,
 *      and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_mtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Set up starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When a frag_list is given, use it.  First, check its validity:
         * some transformers may create a wrong frag_list or break an
         * existing one; that is not prohibited.  In such a case, fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes out. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* for bridged IP traffic encapsulated inside e.g. a VLAN header,
         * we need to make room for the encapsulating header
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }
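                /* Worked example (illustrative): device MTU 1500 and
                 * hlen 20 give mtu (data space) 1480, already a multiple
                 * of 8.  A 3000-byte payload yields fragments of 1480,
                 * 1480 and 40 data bytes with offset fields 0, 185 and 370
                 * (units of 8 bytes); MF is set on all but the last. */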
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
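                /* skb_copy_from_linear_data(skb, to, len) is the helper
                 * this patch introduces; it is simply
                 *
                 *      memcpy(to, skb->data, len);
                 *
                 * and the _offset variant copies from skb->data + offset. */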

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and do it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep MF set on each fragment
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

EXPORT_SYMBOL(ip_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
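
/* Usage sketch (hedged): this is the stock getfrag callback for copying
 * from a userspace iovec; udp_sendmsg() of this era hands it to
 * ip_append_data() roughly as:
 *
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
 *                           sizeof(struct udphdr), &ipc, rt,
 *                           msg->msg_flags);
 */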

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so create
         * one single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_reset_network_header(skb);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                               (length - transhdrlen));
        if (!err) {
                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UFO,
         * so follow the normal path.
         */
        kfree_skb(skb);
        return err;
}
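
/* Note on the gso_size chosen above: mtu - fragheaderlen is the payload
 * each on-the-wire fragment will carry.  E.g. with an MTU of 1500 and a
 * 20-byte IP header, gso_size is 1480, so a large datagram leaves the
 * device as fragments each carrying up to 1480 data bytes (hedged sketch
 * of the UFO split, which the device or GSO layer performs later). */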

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data.  Each piece will be held on the socket
 *      until ip_push_pending_frames() is called.  Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP; other transport protocols - e.g. raw sockets - can
 *      potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
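
/* Minimal usage sketch (hedged; error handling and transport-header
 * construction elided): a corked UDP-style sender appends pieces and then
 * pushes one datagram:
 *
 *      err = ip_append_data(sk, ip_generic_getfrag, iov1, len1,
 *                           sizeof(struct udphdr), &ipc, rt, MSG_MORE);
 *      err = ip_append_data(sk, ip_generic_getfrag, iov2, len2,
 *                           sizeof(struct udphdr), &ipc, rt, 0);
 *      err = ip_push_pending_frames(sk);
 */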
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
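
        /* Worked example: with mtu 1006 and no options, fragheaderlen is 20
         * and maxfraglen is ((986 & ~7) + 20) = 1004 - two bytes of the MTU
         * are sacrificed so that every non-final fragment carries a
         * multiple of 8 data bytes, as the IP offset field requires. */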

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented in the future.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
                        (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of skbs;
         * each segment is an IP fragment, ready for sending to the network
         * once the appropriate IP header has been added.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;
                        /* The last fragment gets additional space at tail.
                         * Note that with MSG_MORE we overallocate on
                         * fragments, because we have no idea which fragment
                         * will be the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {
                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}
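
/* Usage sketch (hedged): this is the zero-copy counterpart of
 * ip_append_data(); in this era udp_sendpage() feeds it pages coming from
 * sendfile():
 *
 *      ret = ip_append_page(sk, page, offset, size, flags);
 */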

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data from the ext header to the IP header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
         * allow the frame generated here to be fragmented.  No matter how
         * transforms change the size of the packet, it will go out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not-yet-fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}

/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far.  ICMP should use this function too.
 *
 *      Should run single-threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = tcp_hdr(skb)->dest,
                                                 .dport = tcp_hdr(skb)->source } },
                                    .proto = sk->sk_protocol };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.
           Note that it relies on the fact that this function is called
           with BH disabled locally and that sk cannot already be spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}
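
/* Usage sketch (hedged): in this era TCP builds RST/ACK replies on a
 * private socket and lets this function route and emit them, roughly:
 *
 *      ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 */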

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);