[NETNS][IPV6] route6 - add netns parameter to ip6_route_output
[safe/jmp/linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 int __ip6_local_out(struct sk_buff *skb)
74 {
75         int len;
76
77         len = skb->len - sizeof(struct ipv6hdr);
78         if (len > IPV6_MAXPLEN)
79                 len = 0;
80         ipv6_hdr(skb)->payload_len = htons(len);
81
82         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
83                        dst_output);
84 }
85
/*
 * Send a locally generated IPv6 packet.
 *
 * Runs __ip6_local_out(); when that returns 1 the packet is handed on
 * to dst_output() and its result is returned.  Any other value from
 * __ip6_local_out() is returned to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
96 EXPORT_SYMBOL_GPL(ip6_local_out);
97
98 static int ip6_output_finish(struct sk_buff *skb)
99 {
100         struct dst_entry *dst = skb->dst;
101
102         if (dst->hh)
103                 return neigh_hh_output(dst->hh, skb);
104         else if (dst->neighbour)
105                 return dst->neighbour->output(skb);
106
107         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
108         kfree_skb(skb);
109         return -EINVAL;
110
111 }
112
113 /* dev_loopback_xmit for use with netfilter. */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
115 {
116         skb_reset_mac_header(newskb);
117         __skb_pull(newskb, skb_network_offset(newskb));
118         newskb->pkt_type = PACKET_LOOPBACK;
119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
120         BUG_TRAP(newskb->dst);
121
122         netif_rx(newskb);
123         return 0;
124 }
125
126
127 static int ip6_output2(struct sk_buff *skb)
128 {
129         struct dst_entry *dst = skb->dst;
130         struct net_device *dev = dst->dev;
131
132         skb->protocol = htons(ETH_P_IPV6);
133         skb->dev = dev;
134
135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
138
139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141                                         &ipv6_hdr(skb)->saddr)) {
142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
143
144                         /* Do not check for IFF_ALLMULTI; multicast routing
145                            is not supported in any case.
146                          */
147                         if (newskb)
148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
149                                         NULL, newskb->dev,
150                                         ip6_dev_loopback_xmit);
151
152                         if (ipv6_hdr(skb)->hop_limit == 0) {
153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
154                                 kfree_skb(skb);
155                                 return 0;
156                         }
157                 }
158
159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
160         }
161
162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
163                        ip6_output_finish);
164 }
165
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
167 {
168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
169
170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171                skb->dst->dev->mtu : dst_mtu(skb->dst);
172 }
173
174 int ip6_output(struct sk_buff *skb)
175 {
176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177                                 dst_allfrag(skb->dst))
178                 return ip6_fragment(skb, ip6_output2);
179         else
180                 return ip6_output2(skb);
181 }
182
183 /*
184  *      xmit an sk_buff (used by TCP)
185  */
186
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188              struct ipv6_txoptions *opt, int ipfragok)
189 {
190         struct ipv6_pinfo *np = inet6_sk(sk);
191         struct in6_addr *first_hop = &fl->fl6_dst;
192         struct dst_entry *dst = skb->dst;
193         struct ipv6hdr *hdr;
194         u8  proto = fl->proto;
195         int seg_len = skb->len;
196         int hlimit, tclass;
197         u32 mtu;
198
199         if (opt) {
200                 unsigned int head_room;
201
202                 /* First: exthdrs may take lots of space (~8K for now)
203                    MAX_HEADER is not enough.
204                  */
205                 head_room = opt->opt_nflen + opt->opt_flen;
206                 seg_len += head_room;
207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
208
209                 if (skb_headroom(skb) < head_room) {
210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211                         if (skb2 == NULL) {
212                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
213                                               IPSTATS_MIB_OUTDISCARDS);
214                                 kfree_skb(skb);
215                                 return -ENOBUFS;
216                         }
217                         kfree_skb(skb);
218                         skb = skb2;
219                         if (sk)
220                                 skb_set_owner_w(skb, sk);
221                 }
222                 if (opt->opt_flen)
223                         ipv6_push_frag_opts(skb, opt, &proto);
224                 if (opt->opt_nflen)
225                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
226         }
227
228         skb_push(skb, sizeof(struct ipv6hdr));
229         skb_reset_network_header(skb);
230         hdr = ipv6_hdr(skb);
231
232         /*
233          *      Fill in the IPv6 header
234          */
235
236         hlimit = -1;
237         if (np)
238                 hlimit = np->hop_limit;
239         if (hlimit < 0)
240                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
241         if (hlimit < 0)
242                 hlimit = ipv6_get_hoplimit(dst->dev);
243
244         tclass = -1;
245         if (np)
246                 tclass = np->tclass;
247         if (tclass < 0)
248                 tclass = 0;
249
250         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
251
252         hdr->payload_len = htons(seg_len);
253         hdr->nexthdr = proto;
254         hdr->hop_limit = hlimit;
255
256         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
257         ipv6_addr_copy(&hdr->daddr, first_hop);
258
259         skb->priority = sk->sk_priority;
260         skb->mark = sk->sk_mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
264                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
265                               IPSTATS_MIB_OUTREQUESTS);
266                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
267                                 dst_output);
268         }
269
270         if (net_ratelimit())
271                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
272         skb->dev = dst->dev;
273         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
274         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
275         kfree_skb(skb);
276         return -EMSGSIZE;
277 }
278
279 EXPORT_SYMBOL(ip6_xmit);
280
281 /*
282  *      To avoid extra problems ND packets are send through this
283  *      routine. It's code duplication but I really want to avoid
284  *      extra checks since ipv6_build_header is used by TCP (which
285  *      is for us performance critical)
286  */
287
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289                struct in6_addr *saddr, struct in6_addr *daddr,
290                int proto, int len)
291 {
292         struct ipv6_pinfo *np = inet6_sk(sk);
293         struct ipv6hdr *hdr;
294         int totlen;
295
296         skb->protocol = htons(ETH_P_IPV6);
297         skb->dev = dev;
298
299         totlen = len + sizeof(struct ipv6hdr);
300
301         skb_reset_network_header(skb);
302         skb_put(skb, sizeof(struct ipv6hdr));
303         hdr = ipv6_hdr(skb);
304
305         *(__be32*)hdr = htonl(0x60000000);
306
307         hdr->payload_len = htons(len);
308         hdr->nexthdr = proto;
309         hdr->hop_limit = np->hop_limit;
310
311         ipv6_addr_copy(&hdr->saddr, saddr);
312         ipv6_addr_copy(&hdr->daddr, daddr);
313
314         return 0;
315 }
316
317 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
318 {
319         struct ip6_ra_chain *ra;
320         struct sock *last = NULL;
321
322         read_lock(&ip6_ra_lock);
323         for (ra = ip6_ra_chain; ra; ra = ra->next) {
324                 struct sock *sk = ra->sk;
325                 if (sk && ra->sel == sel &&
326                     (!sk->sk_bound_dev_if ||
327                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
328                         if (last) {
329                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
330                                 if (skb2)
331                                         rawv6_rcv(last, skb2);
332                         }
333                         last = sk;
334                 }
335         }
336
337         if (last) {
338                 rawv6_rcv(last, skb);
339                 read_unlock(&ip6_ra_lock);
340                 return 1;
341         }
342         read_unlock(&ip6_ra_lock);
343         return 0;
344 }
345
346 static int ip6_forward_proxy_check(struct sk_buff *skb)
347 {
348         struct ipv6hdr *hdr = ipv6_hdr(skb);
349         u8 nexthdr = hdr->nexthdr;
350         int offset;
351
352         if (ipv6_ext_hdr(nexthdr)) {
353                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
354                 if (offset < 0)
355                         return 0;
356         } else
357                 offset = sizeof(struct ipv6hdr);
358
359         if (nexthdr == IPPROTO_ICMPV6) {
360                 struct icmp6hdr *icmp6;
361
362                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
363                                          offset + 1 - skb->data)))
364                         return 0;
365
366                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
367
368                 switch (icmp6->icmp6_type) {
369                 case NDISC_ROUTER_SOLICITATION:
370                 case NDISC_ROUTER_ADVERTISEMENT:
371                 case NDISC_NEIGHBOUR_SOLICITATION:
372                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
373                 case NDISC_REDIRECT:
374                         /* For reaction involving unicast neighbor discovery
375                          * message destined to the proxied address, pass it to
376                          * input function.
377                          */
378                         return 1;
379                 default:
380                         break;
381                 }
382         }
383
384         /*
385          * The proxying router can't forward traffic sent to a link-local
386          * address, so signal the sender and discard the packet. This
387          * behavior is clarified by the MIPv6 specification.
388          */
389         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
390                 dst_link_failure(skb);
391                 return -1;
392         }
393
394         return 0;
395 }
396
/*
 * Continuation of the FORWARD netfilter hook: pass the packet on to
 * dst_output() and report its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
401
402 int ip6_forward(struct sk_buff *skb)
403 {
404         struct dst_entry *dst = skb->dst;
405         struct ipv6hdr *hdr = ipv6_hdr(skb);
406         struct inet6_skb_parm *opt = IP6CB(skb);
407
408         if (ipv6_devconf.forwarding == 0)
409                 goto error;
410
411         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
412                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
413                 goto drop;
414         }
415
416         skb_forward_csum(skb);
417
418         /*
419          *      We DO NOT make any processing on
420          *      RA packets, pushing them to user level AS IS
421          *      without ane WARRANTY that application will be able
422          *      to interpret them. The reason is that we
423          *      cannot make anything clever here.
424          *
425          *      We are not end-node, so that if packet contains
426          *      AH/ESP, we cannot make anything.
427          *      Defragmentation also would be mistake, RA packets
428          *      cannot be fragmented, because there is no warranty
429          *      that different fragments will go along one path. --ANK
430          */
431         if (opt->ra) {
432                 u8 *ptr = skb_network_header(skb) + opt->ra;
433                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
434                         return 0;
435         }
436
437         /*
438          *      check and decrement ttl
439          */
440         if (hdr->hop_limit <= 1) {
441                 /* Force OUTPUT device used as source address */
442                 skb->dev = dst->dev;
443                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
444                             0, skb->dev);
445                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
446
447                 kfree_skb(skb);
448                 return -ETIMEDOUT;
449         }
450
451         /* XXX: idev->cnf.proxy_ndp? */
452         if (ipv6_devconf.proxy_ndp &&
453             pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
454                 int proxied = ip6_forward_proxy_check(skb);
455                 if (proxied > 0)
456                         return ip6_input(skb);
457                 else if (proxied < 0) {
458                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
459                         goto drop;
460                 }
461         }
462
463         if (!xfrm6_route_forward(skb)) {
464                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
465                 goto drop;
466         }
467         dst = skb->dst;
468
469         /* IPv6 specs say nothing about it, but it is clear that we cannot
470            send redirects to source routed frames.
471            We don't send redirects to frames decapsulated from IPsec.
472          */
473         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
474             !skb->sp) {
475                 struct in6_addr *target = NULL;
476                 struct rt6_info *rt;
477                 struct neighbour *n = dst->neighbour;
478
479                 /*
480                  *      incoming and outgoing devices are the same
481                  *      send a redirect.
482                  */
483
484                 rt = (struct rt6_info *) dst;
485                 if ((rt->rt6i_flags & RTF_GATEWAY))
486                         target = (struct in6_addr*)&n->primary_key;
487                 else
488                         target = &hdr->daddr;
489
490                 /* Limit redirects both by destination (here)
491                    and by source (inside ndisc_send_redirect)
492                  */
493                 if (xrlim_allow(dst, 1*HZ))
494                         ndisc_send_redirect(skb, n, target);
495         } else {
496                 int addrtype = ipv6_addr_type(&hdr->saddr);
497
498                 /* This check is security critical. */
499                 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
500                         goto error;
501                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
502                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
503                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
504                         goto error;
505                 }
506         }
507
508         if (skb->len > dst_mtu(dst)) {
509                 /* Again, force OUTPUT device used as source address */
510                 skb->dev = dst->dev;
511                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
512                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
514                 kfree_skb(skb);
515                 return -EMSGSIZE;
516         }
517
518         if (skb_cow(skb, dst->dev->hard_header_len)) {
519                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
520                 goto drop;
521         }
522
523         hdr = ipv6_hdr(skb);
524
525         /* Mangling hops number delayed to point after skb COW */
526
527         hdr->hop_limit--;
528
529         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
531                        ip6_forward_finish);
532
533 error:
534         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
535 drop:
536         kfree_skb(skb);
537         return -EINVAL;
538 }
539
540 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 {
542         to->pkt_type = from->pkt_type;
543         to->priority = from->priority;
544         to->protocol = from->protocol;
545         dst_release(to->dst);
546         to->dst = dst_clone(from->dst);
547         to->dev = from->dev;
548         to->mark = from->mark;
549
550 #ifdef CONFIG_NET_SCHED
551         to->tc_index = from->tc_index;
552 #endif
553         nf_copy(to, from);
554 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
555     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
556         to->nf_trace = from->nf_trace;
557 #endif
558         skb_copy_secmark(to, from);
559 }
560
561 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 {
563         u16 offset = sizeof(struct ipv6hdr);
564         struct ipv6_opt_hdr *exthdr =
565                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
566         unsigned int packet_len = skb->tail - skb->network_header;
567         int found_rhdr = 0;
568         *nexthdr = &ipv6_hdr(skb)->nexthdr;
569
570         while (offset + 1 <= packet_len) {
571
572                 switch (**nexthdr) {
573
574                 case NEXTHDR_HOP:
575                         break;
576                 case NEXTHDR_ROUTING:
577                         found_rhdr = 1;
578                         break;
579                 case NEXTHDR_DEST:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
581                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
582                                 break;
583 #endif
584                         if (found_rhdr)
585                                 return offset;
586                         break;
587                 default :
588                         return offset;
589                 }
590
591                 offset += ipv6_optlen(exthdr);
592                 *nexthdr = &exthdr->nexthdr;
593                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
594                                                  offset);
595         }
596
597         return offset;
598 }
599
600 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
601 {
602         struct net_device *dev;
603         struct sk_buff *frag;
604         struct rt6_info *rt = (struct rt6_info*)skb->dst;
605         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
606         struct ipv6hdr *tmp_hdr;
607         struct frag_hdr *fh;
608         unsigned int mtu, hlen, left, len;
609         __be32 frag_id = 0;
610         int ptr, offset = 0, err=0;
611         u8 *prevhdr, nexthdr = 0;
612
613         dev = rt->u.dst.dev;
614         hlen = ip6_find_1stfragopt(skb, &prevhdr);
615         nexthdr = *prevhdr;
616
617         mtu = ip6_skb_dst_mtu(skb);
618
619         /* We must not fragment if the socket is set to force MTU discovery
620          * or if the skb it not generated by a local socket.  (This last
621          * check should be redundant, but it's free.)
622          */
623         if (!skb->local_df) {
624                 skb->dev = skb->dst->dev;
625                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
626                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
627                 kfree_skb(skb);
628                 return -EMSGSIZE;
629         }
630
631         if (np && np->frag_size < mtu) {
632                 if (np->frag_size)
633                         mtu = np->frag_size;
634         }
635         mtu -= hlen + sizeof(struct frag_hdr);
636
637         if (skb_shinfo(skb)->frag_list) {
638                 int first_len = skb_pagelen(skb);
639                 int truesizes = 0;
640
641                 if (first_len - hlen > mtu ||
642                     ((first_len - hlen) & 7) ||
643                     skb_cloned(skb))
644                         goto slow_path;
645
646                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
647                         /* Correct geometry. */
648                         if (frag->len > mtu ||
649                             ((frag->len & 7) && frag->next) ||
650                             skb_headroom(frag) < hlen)
651                             goto slow_path;
652
653                         /* Partially cloned skb? */
654                         if (skb_shared(frag))
655                                 goto slow_path;
656
657                         BUG_ON(frag->sk);
658                         if (skb->sk) {
659                                 sock_hold(skb->sk);
660                                 frag->sk = skb->sk;
661                                 frag->destructor = sock_wfree;
662                                 truesizes += frag->truesize;
663                         }
664                 }
665
666                 err = 0;
667                 offset = 0;
668                 frag = skb_shinfo(skb)->frag_list;
669                 skb_shinfo(skb)->frag_list = NULL;
670                 /* BUILD HEADER */
671
672                 *prevhdr = NEXTHDR_FRAGMENT;
673                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
674                 if (!tmp_hdr) {
675                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
676                         return -ENOMEM;
677                 }
678
679                 __skb_pull(skb, hlen);
680                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
681                 __skb_push(skb, hlen);
682                 skb_reset_network_header(skb);
683                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
684
685                 ipv6_select_ident(skb, fh);
686                 fh->nexthdr = nexthdr;
687                 fh->reserved = 0;
688                 fh->frag_off = htons(IP6_MF);
689                 frag_id = fh->identification;
690
691                 first_len = skb_pagelen(skb);
692                 skb->data_len = first_len - skb_headlen(skb);
693                 skb->truesize -= truesizes;
694                 skb->len = first_len;
695                 ipv6_hdr(skb)->payload_len = htons(first_len -
696                                                    sizeof(struct ipv6hdr));
697
698                 dst_hold(&rt->u.dst);
699
700                 for (;;) {
701                         /* Prepare header of the next frame,
702                          * before previous one went down. */
703                         if (frag) {
704                                 frag->ip_summed = CHECKSUM_NONE;
705                                 skb_reset_transport_header(frag);
706                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
707                                 __skb_push(frag, hlen);
708                                 skb_reset_network_header(frag);
709                                 memcpy(skb_network_header(frag), tmp_hdr,
710                                        hlen);
711                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
712                                 fh->nexthdr = nexthdr;
713                                 fh->reserved = 0;
714                                 fh->frag_off = htons(offset);
715                                 if (frag->next != NULL)
716                                         fh->frag_off |= htons(IP6_MF);
717                                 fh->identification = frag_id;
718                                 ipv6_hdr(frag)->payload_len =
719                                                 htons(frag->len -
720                                                       sizeof(struct ipv6hdr));
721                                 ip6_copy_metadata(frag, skb);
722                         }
723
724                         err = output(skb);
725                         if(!err)
726                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
727
728                         if (err || !frag)
729                                 break;
730
731                         skb = frag;
732                         frag = skb->next;
733                         skb->next = NULL;
734                 }
735
736                 kfree(tmp_hdr);
737
738                 if (err == 0) {
739                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
740                         dst_release(&rt->u.dst);
741                         return 0;
742                 }
743
744                 while (frag) {
745                         skb = frag->next;
746                         kfree_skb(frag);
747                         frag = skb;
748                 }
749
750                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
751                 dst_release(&rt->u.dst);
752                 return err;
753         }
754
755 slow_path:
756         left = skb->len - hlen;         /* Space per frame */
757         ptr = hlen;                     /* Where to start from */
758
759         /*
760          *      Fragment the datagram.
761          */
762
763         *prevhdr = NEXTHDR_FRAGMENT;
764
765         /*
766          *      Keep copying data until we run out.
767          */
768         while(left > 0) {
769                 len = left;
770                 /* IF: it doesn't fit, use 'mtu' - the data space left */
771                 if (len > mtu)
772                         len = mtu;
773                 /* IF: we are not sending upto and including the packet end
774                    then align the next start on an eight byte boundary */
775                 if (len < left) {
776                         len &= ~7;
777                 }
778                 /*
779                  *      Allocate buffer.
780                  */
781
782                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
783                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
784                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
785                                       IPSTATS_MIB_FRAGFAILS);
786                         err = -ENOMEM;
787                         goto fail;
788                 }
789
790                 /*
791                  *      Set up data on packet
792                  */
793
794                 ip6_copy_metadata(frag, skb);
795                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
796                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
797                 skb_reset_network_header(frag);
798                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
799                 frag->transport_header = (frag->network_header + hlen +
800                                           sizeof(struct frag_hdr));
801
802                 /*
803                  *      Charge the memory for the fragment to any owner
804                  *      it might possess
805                  */
806                 if (skb->sk)
807                         skb_set_owner_w(frag, skb->sk);
808
809                 /*
810                  *      Copy the packet header into the new buffer.
811                  */
812                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
813
814                 /*
815                  *      Build fragment header.
816                  */
817                 fh->nexthdr = nexthdr;
818                 fh->reserved = 0;
819                 if (!frag_id) {
820                         ipv6_select_ident(skb, fh);
821                         frag_id = fh->identification;
822                 } else
823                         fh->identification = frag_id;
824
825                 /*
826                  *      Copy a block of the IP datagram.
827                  */
828                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
829                         BUG();
830                 left -= len;
831
832                 fh->frag_off = htons(offset);
833                 if (left > 0)
834                         fh->frag_off |= htons(IP6_MF);
835                 ipv6_hdr(frag)->payload_len = htons(frag->len -
836                                                     sizeof(struct ipv6hdr));
837
838                 ptr += len;
839                 offset += len;
840
841                 /*
842                  *      Put this fragment into the sending queue.
843                  */
844                 err = output(frag);
845                 if (err)
846                         goto fail;
847
848                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
849         }
850         IP6_INC_STATS(ip6_dst_idev(skb->dst),
851                       IPSTATS_MIB_FRAGOKS);
852         kfree_skb(skb);
853         return err;
854
855 fail:
856         IP6_INC_STATS(ip6_dst_idev(skb->dst),
857                       IPSTATS_MIB_FRAGFAILS);
858         kfree_skb(skb);
859         return err;
860 }
861
862 static inline int ip6_rt_check(struct rt6key *rt_key,
863                                struct in6_addr *fl_addr,
864                                struct in6_addr *addr_cache)
865 {
866         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
867                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
868 }
869
870 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
871                                           struct dst_entry *dst,
872                                           struct flowi *fl)
873 {
874         struct ipv6_pinfo *np = inet6_sk(sk);
875         struct rt6_info *rt = (struct rt6_info *)dst;
876
877         if (!dst)
878                 goto out;
879
880         /* Yes, checking route validity in not connected
881          * case is not very simple. Take into account,
882          * that we do not support routing by source, TOS,
883          * and MSG_DONTROUTE            --ANK (980726)
884          *
885          * 1. ip6_rt_check(): If route was host route,
886          *    check that cached destination is current.
887          *    If it is network route, we still may
888          *    check its validity using saved pointer
889          *    to the last used address: daddr_cache.
890          *    We do not want to save whole address now,
891          *    (because main consumer of this service
892          *    is tcp, which has not this problem),
893          *    so that the last trick works only on connected
894          *    sockets.
895          * 2. oif also should be the same.
896          */
897         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
898 #ifdef CONFIG_IPV6_SUBTREES
899             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
900 #endif
901             (fl->oif && fl->oif != dst->dev->ifindex)) {
902                 dst_release(dst);
903                 dst = NULL;
904         }
905
906 out:
907         return dst;
908 }
909
910 static int ip6_dst_lookup_tail(struct sock *sk,
911                                struct dst_entry **dst, struct flowi *fl)
912 {
913         int err;
914
915         if (*dst == NULL)
916                 *dst = ip6_route_output(&init_net, sk, fl);
917
918         if ((err = (*dst)->error))
919                 goto out_err_release;
920
921         if (ipv6_addr_any(&fl->fl6_src)) {
922                 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
923                                          &fl->fl6_dst, &fl->fl6_src);
924                 if (err)
925                         goto out_err_release;
926         }
927
928 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
929                 /*
930                  * Here if the dst entry we've looked up
931                  * has a neighbour entry that is in the INCOMPLETE
932                  * state and the src address from the flow is
933                  * marked as OPTIMISTIC, we release the found
934                  * dst entry and replace it instead with the
935                  * dst entry of the nexthop router
936                  */
937                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
938                         struct inet6_ifaddr *ifp;
939                         struct flowi fl_gw;
940                         int redirect;
941
942                         ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
943                                               (*dst)->dev, 1);
944
945                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
946                         if (ifp)
947                                 in6_ifa_put(ifp);
948
949                         if (redirect) {
950                                 /*
951                                  * We need to get the dst entry for the
952                                  * default router instead
953                                  */
954                                 dst_release(*dst);
955                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
956                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
957                                 *dst = ip6_route_output(&init_net, sk, &fl_gw);
958                                 if ((err = (*dst)->error))
959                                         goto out_err_release;
960                         }
961                 }
962 #endif
963
964         return 0;
965
966 out_err_release:
967         if (err == -ENETUNREACH)
968                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
969         dst_release(*dst);
970         *dst = NULL;
971         return err;
972 }
973
974 /**
975  *      ip6_dst_lookup - perform route lookup on flow
976  *      @sk: socket which provides route info
977  *      @dst: pointer to dst_entry * for result
978  *      @fl: flow to lookup
979  *
980  *      This function performs a route lookup on the given flow.
981  *
982  *      It returns zero on success, or a standard errno code on error.
983  */
984 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
985 {
986         *dst = NULL;
987         return ip6_dst_lookup_tail(sk, dst, fl);
988 }
989 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
990
991 /**
992  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
993  *      @sk: socket which provides the dst cache and route info
994  *      @dst: pointer to dst_entry * for result
995  *      @fl: flow to lookup
996  *
997  *      This function performs a route lookup on the given flow with the
998  *      possibility of using the cached route in the socket if it is valid.
999  *      It will take the socket dst lock when operating on the dst cache.
1000  *      As a result, this function can only be used in process context.
1001  *
1002  *      It returns zero on success, or a standard errno code on error.
1003  */
1004 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1005 {
1006         *dst = NULL;
1007         if (sk) {
1008                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1009                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1010         }
1011
1012         return ip6_dst_lookup_tail(sk, dst, fl);
1013 }
1014 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1015
1016 static inline int ip6_ufo_append_data(struct sock *sk,
1017                         int getfrag(void *from, char *to, int offset, int len,
1018                         int odd, struct sk_buff *skb),
1019                         void *from, int length, int hh_len, int fragheaderlen,
1020                         int transhdrlen, int mtu,unsigned int flags)
1021
1022 {
1023         struct sk_buff *skb;
1024         int err;
1025
1026         /* There is support for UDP large send offload by network
1027          * device, so create one single skb packet containing complete
1028          * udp datagram
1029          */
1030         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1031                 skb = sock_alloc_send_skb(sk,
1032                         hh_len + fragheaderlen + transhdrlen + 20,
1033                         (flags & MSG_DONTWAIT), &err);
1034                 if (skb == NULL)
1035                         return -ENOMEM;
1036
1037                 /* reserve space for Hardware header */
1038                 skb_reserve(skb, hh_len);
1039
1040                 /* create space for UDP/IP header */
1041                 skb_put(skb,fragheaderlen + transhdrlen);
1042
1043                 /* initialize network header pointer */
1044                 skb_reset_network_header(skb);
1045
1046                 /* initialize protocol header pointer */
1047                 skb->transport_header = skb->network_header + fragheaderlen;
1048
1049                 skb->ip_summed = CHECKSUM_PARTIAL;
1050                 skb->csum = 0;
1051                 sk->sk_sndmsg_off = 0;
1052         }
1053
1054         err = skb_append_datato_frags(sk,skb, getfrag, from,
1055                                       (length - transhdrlen));
1056         if (!err) {
1057                 struct frag_hdr fhdr;
1058
1059                 /* specify the length of each IP datagram fragment*/
1060                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1061                                             sizeof(struct frag_hdr);
1062                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1063                 ipv6_select_ident(skb, &fhdr);
1064                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1065                 __skb_queue_tail(&sk->sk_write_queue, skb);
1066
1067                 return 0;
1068         }
1069         /* There is not enough support do UPD LSO,
1070          * so follow normal path
1071          */
1072         kfree_skb(skb);
1073
1074         return err;
1075 }
1076
1077 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1078         int offset, int len, int odd, struct sk_buff *skb),
1079         void *from, int length, int transhdrlen,
1080         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1081         struct rt6_info *rt, unsigned int flags)
1082 {
1083         struct inet_sock *inet = inet_sk(sk);
1084         struct ipv6_pinfo *np = inet6_sk(sk);
1085         struct sk_buff *skb;
1086         unsigned int maxfraglen, fragheaderlen;
1087         int exthdrlen;
1088         int hh_len;
1089         int mtu;
1090         int copy;
1091         int err;
1092         int offset = 0;
1093         int csummode = CHECKSUM_NONE;
1094
1095         if (flags&MSG_PROBE)
1096                 return 0;
1097         if (skb_queue_empty(&sk->sk_write_queue)) {
1098                 /*
1099                  * setup for corking
1100                  */
1101                 if (opt) {
1102                         if (np->cork.opt == NULL) {
1103                                 np->cork.opt = kmalloc(opt->tot_len,
1104                                                        sk->sk_allocation);
1105                                 if (unlikely(np->cork.opt == NULL))
1106                                         return -ENOBUFS;
1107                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1108                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1109                                 return -EINVAL;
1110                         }
1111                         memcpy(np->cork.opt, opt, opt->tot_len);
1112                         inet->cork.flags |= IPCORK_OPT;
1113                         /* need source address above miyazawa*/
1114                 }
1115                 dst_hold(&rt->u.dst);
1116                 np->cork.rt = rt;
1117                 inet->cork.fl = *fl;
1118                 np->cork.hop_limit = hlimit;
1119                 np->cork.tclass = tclass;
1120                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1121                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1122                 if (np->frag_size < mtu) {
1123                         if (np->frag_size)
1124                                 mtu = np->frag_size;
1125                 }
1126                 inet->cork.fragsize = mtu;
1127                 if (dst_allfrag(rt->u.dst.path))
1128                         inet->cork.flags |= IPCORK_ALLFRAG;
1129                 inet->cork.length = 0;
1130                 sk->sk_sndmsg_page = NULL;
1131                 sk->sk_sndmsg_off = 0;
1132                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1133                             rt->rt6i_nfheader_len;
1134                 length += exthdrlen;
1135                 transhdrlen += exthdrlen;
1136         } else {
1137                 rt = np->cork.rt;
1138                 fl = &inet->cork.fl;
1139                 if (inet->cork.flags & IPCORK_OPT)
1140                         opt = np->cork.opt;
1141                 transhdrlen = 0;
1142                 exthdrlen = 0;
1143                 mtu = inet->cork.fragsize;
1144         }
1145
1146         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1147
1148         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1149                         (opt ? opt->opt_nflen : 0);
1150         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1151
1152         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1153                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1154                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1155                         return -EMSGSIZE;
1156                 }
1157         }
1158
1159         /*
1160          * Let's try using as much space as possible.
1161          * Use MTU if total length of the message fits into the MTU.
1162          * Otherwise, we need to reserve fragment header and
1163          * fragment alignment (= 8-15 octects, in total).
1164          *
1165          * Note that we may need to "move" the data from the tail of
1166          * of the buffer to the new fragment when we split
1167          * the message.
1168          *
1169          * FIXME: It may be fragmented into multiple chunks
1170          *        at once if non-fragmentable extension headers
1171          *        are too large.
1172          * --yoshfuji
1173          */
1174
1175         inet->cork.length += length;
1176         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1177             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1178
1179                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1180                                           fragheaderlen, transhdrlen, mtu,
1181                                           flags);
1182                 if (err)
1183                         goto error;
1184                 return 0;
1185         }
1186
1187         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1188                 goto alloc_new_skb;
1189
1190         while (length > 0) {
1191                 /* Check if the remaining data fits into current packet. */
1192                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1193                 if (copy < length)
1194                         copy = maxfraglen - skb->len;
1195
1196                 if (copy <= 0) {
1197                         char *data;
1198                         unsigned int datalen;
1199                         unsigned int fraglen;
1200                         unsigned int fraggap;
1201                         unsigned int alloclen;
1202                         struct sk_buff *skb_prev;
1203 alloc_new_skb:
1204                         skb_prev = skb;
1205
1206                         /* There's no room in the current skb */
1207                         if (skb_prev)
1208                                 fraggap = skb_prev->len - maxfraglen;
1209                         else
1210                                 fraggap = 0;
1211
1212                         /*
1213                          * If remaining data exceeds the mtu,
1214                          * we know we need more fragment(s).
1215                          */
1216                         datalen = length + fraggap;
1217                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1218                                 datalen = maxfraglen - fragheaderlen;
1219
1220                         fraglen = datalen + fragheaderlen;
1221                         if ((flags & MSG_MORE) &&
1222                             !(rt->u.dst.dev->features&NETIF_F_SG))
1223                                 alloclen = mtu;
1224                         else
1225                                 alloclen = datalen + fragheaderlen;
1226
1227                         /*
1228                          * The last fragment gets additional space at tail.
1229                          * Note: we overallocate on fragments with MSG_MODE
1230                          * because we have no idea if we're the last one.
1231                          */
1232                         if (datalen == length + fraggap)
1233                                 alloclen += rt->u.dst.trailer_len;
1234
1235                         /*
1236                          * We just reserve space for fragment header.
1237                          * Note: this may be overallocation if the message
1238                          * (without MSG_MORE) fits into the MTU.
1239                          */
1240                         alloclen += sizeof(struct frag_hdr);
1241
1242                         if (transhdrlen) {
1243                                 skb = sock_alloc_send_skb(sk,
1244                                                 alloclen + hh_len,
1245                                                 (flags & MSG_DONTWAIT), &err);
1246                         } else {
1247                                 skb = NULL;
1248                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1249                                     2 * sk->sk_sndbuf)
1250                                         skb = sock_wmalloc(sk,
1251                                                            alloclen + hh_len, 1,
1252                                                            sk->sk_allocation);
1253                                 if (unlikely(skb == NULL))
1254                                         err = -ENOBUFS;
1255                         }
1256                         if (skb == NULL)
1257                                 goto error;
1258                         /*
1259                          *      Fill in the control structures
1260                          */
1261                         skb->ip_summed = csummode;
1262                         skb->csum = 0;
1263                         /* reserve for fragmentation */
1264                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1265
1266                         /*
1267                          *      Find where to start putting bytes
1268                          */
1269                         data = skb_put(skb, fraglen);
1270                         skb_set_network_header(skb, exthdrlen);
1271                         data += fragheaderlen;
1272                         skb->transport_header = (skb->network_header +
1273                                                  fragheaderlen);
1274                         if (fraggap) {
1275                                 skb->csum = skb_copy_and_csum_bits(
1276                                         skb_prev, maxfraglen,
1277                                         data + transhdrlen, fraggap, 0);
1278                                 skb_prev->csum = csum_sub(skb_prev->csum,
1279                                                           skb->csum);
1280                                 data += fraggap;
1281                                 pskb_trim_unique(skb_prev, maxfraglen);
1282                         }
1283                         copy = datalen - transhdrlen - fraggap;
1284                         if (copy < 0) {
1285                                 err = -EINVAL;
1286                                 kfree_skb(skb);
1287                                 goto error;
1288                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1289                                 err = -EFAULT;
1290                                 kfree_skb(skb);
1291                                 goto error;
1292                         }
1293
1294                         offset += copy;
1295                         length -= datalen - fraggap;
1296                         transhdrlen = 0;
1297                         exthdrlen = 0;
1298                         csummode = CHECKSUM_NONE;
1299
1300                         /*
1301                          * Put the packet on the pending queue
1302                          */
1303                         __skb_queue_tail(&sk->sk_write_queue, skb);
1304                         continue;
1305                 }
1306
1307                 if (copy > length)
1308                         copy = length;
1309
1310                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1311                         unsigned int off;
1312
1313                         off = skb->len;
1314                         if (getfrag(from, skb_put(skb, copy),
1315                                                 offset, copy, off, skb) < 0) {
1316                                 __skb_trim(skb, off);
1317                                 err = -EFAULT;
1318                                 goto error;
1319                         }
1320                 } else {
1321                         int i = skb_shinfo(skb)->nr_frags;
1322                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1323                         struct page *page = sk->sk_sndmsg_page;
1324                         int off = sk->sk_sndmsg_off;
1325                         unsigned int left;
1326
1327                         if (page && (left = PAGE_SIZE - off) > 0) {
1328                                 if (copy >= left)
1329                                         copy = left;
1330                                 if (page != frag->page) {
1331                                         if (i == MAX_SKB_FRAGS) {
1332                                                 err = -EMSGSIZE;
1333                                                 goto error;
1334                                         }
1335                                         get_page(page);
1336                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1337                                         frag = &skb_shinfo(skb)->frags[i];
1338                                 }
1339                         } else if(i < MAX_SKB_FRAGS) {
1340                                 if (copy > PAGE_SIZE)
1341                                         copy = PAGE_SIZE;
1342                                 page = alloc_pages(sk->sk_allocation, 0);
1343                                 if (page == NULL) {
1344                                         err = -ENOMEM;
1345                                         goto error;
1346                                 }
1347                                 sk->sk_sndmsg_page = page;
1348                                 sk->sk_sndmsg_off = 0;
1349
1350                                 skb_fill_page_desc(skb, i, page, 0, 0);
1351                                 frag = &skb_shinfo(skb)->frags[i];
1352                         } else {
1353                                 err = -EMSGSIZE;
1354                                 goto error;
1355                         }
1356                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1357                                 err = -EFAULT;
1358                                 goto error;
1359                         }
1360                         sk->sk_sndmsg_off += copy;
1361                         frag->size += copy;
1362                         skb->len += copy;
1363                         skb->data_len += copy;
1364                         skb->truesize += copy;
1365                         atomic_add(copy, &sk->sk_wmem_alloc);
1366                 }
1367                 offset += copy;
1368                 length -= copy;
1369         }
1370         return 0;
1371 error:
1372         inet->cork.length -= length;
1373         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1374         return err;
1375 }
1376
1377 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1378 {
1379         inet->cork.flags &= ~IPCORK_OPT;
1380         kfree(np->cork.opt);
1381         np->cork.opt = NULL;
1382         if (np->cork.rt) {
1383                 dst_release(&np->cork.rt->u.dst);
1384                 np->cork.rt = NULL;
1385                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1386         }
1387         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1388 }
1389
1390 int ip6_push_pending_frames(struct sock *sk)
1391 {
1392         struct sk_buff *skb, *tmp_skb;
1393         struct sk_buff **tail_skb;
1394         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1395         struct inet_sock *inet = inet_sk(sk);
1396         struct ipv6_pinfo *np = inet6_sk(sk);
1397         struct ipv6hdr *hdr;
1398         struct ipv6_txoptions *opt = np->cork.opt;
1399         struct rt6_info *rt = np->cork.rt;
1400         struct flowi *fl = &inet->cork.fl;
1401         unsigned char proto = fl->proto;
1402         int err = 0;
1403
1404         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1405                 goto out;
1406         tail_skb = &(skb_shinfo(skb)->frag_list);
1407
1408         /* move skb->data to ip header from ext header */
1409         if (skb->data < skb_network_header(skb))
1410                 __skb_pull(skb, skb_network_offset(skb));
1411         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1412                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1413                 *tail_skb = tmp_skb;
1414                 tail_skb = &(tmp_skb->next);
1415                 skb->len += tmp_skb->len;
1416                 skb->data_len += tmp_skb->len;
1417                 skb->truesize += tmp_skb->truesize;
1418                 __sock_put(tmp_skb->sk);
1419                 tmp_skb->destructor = NULL;
1420                 tmp_skb->sk = NULL;
1421         }
1422
1423         /* Allow local fragmentation. */
1424         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1425                 skb->local_df = 1;
1426
1427         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1428         __skb_pull(skb, skb_network_header_len(skb));
1429         if (opt && opt->opt_flen)
1430                 ipv6_push_frag_opts(skb, opt, &proto);
1431         if (opt && opt->opt_nflen)
1432                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1433
1434         skb_push(skb, sizeof(struct ipv6hdr));
1435         skb_reset_network_header(skb);
1436         hdr = ipv6_hdr(skb);
1437
1438         *(__be32*)hdr = fl->fl6_flowlabel |
1439                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1440
1441         hdr->hop_limit = np->cork.hop_limit;
1442         hdr->nexthdr = proto;
1443         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1444         ipv6_addr_copy(&hdr->daddr, final_dst);
1445
1446         skb->priority = sk->sk_priority;
1447         skb->mark = sk->sk_mark;
1448
1449         skb->dst = dst_clone(&rt->u.dst);
1450         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1451         if (proto == IPPROTO_ICMPV6) {
1452                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1453
1454                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1455                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1456         }
1457
1458         err = ip6_local_out(skb);
1459         if (err) {
1460                 if (err > 0)
1461                         err = np->recverr ? net_xmit_errno(err) : 0;
1462                 if (err)
1463                         goto error;
1464         }
1465
1466 out:
1467         ip6_cork_release(inet, np);
1468         return err;
1469 error:
1470         goto out;
1471 }
1472
1473 void ip6_flush_pending_frames(struct sock *sk)
1474 {
1475         struct sk_buff *skb;
1476
1477         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1478                 if (skb->dst)
1479                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1480                                       IPSTATS_MIB_OUTDISCARDS);
1481                 kfree_skb(skb);
1482         }
1483
1484         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1485 }