[NETFILTER]: reduce netfilter sk_buff enlargement
net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetics in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

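/*
 * Pick the identification for a new fragment header: a single global
 * counter, protected by a spinlock, that skips the value 0 and is
 * stored in network byte order.
 */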
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

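/*
 * Final step of output: if the route carries a cached hardware header,
 * copy it in front of the packet under hh_lock and hand the skb to
 * hh_output(); otherwise resolve via the neighbour entry.  With
 * neither available there is nowhere to send the packet.
 */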
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

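/*
 * Queue the packet towards the device.  For multicast destinations a
 * clone may first be looped back through the POST_ROUTING hook and
 * netif_rx() so that local listeners in the group see it, unless the
 * sending socket disabled this with IPV6_MULTICAST_LOOP.
 */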
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

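/*
 * Entry point from dst_output(): fragment when the packet exceeds the
 * path MTU, or when the route demands a fragment header on every
 * packet (dst_allfrag); otherwise send it out whole.
 */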
int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
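/*
 * Netfilter may rewrite addresses in the header (e.g. from the mangle
 * table), invalidating the routing decision already attached to the
 * skb.  Redo the route lookup and install the fresh dst.
 */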
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

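        /*
         * First 32 bits of the IPv6 header: version (4 bits), traffic
         * class (8 bits) and flow label (20 bits).  fl6_flowlabel
         * already carries traffic class and flow label in network
         * byte order, so a single 32-bit store fills all three fields.
         */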
        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                                dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine.  It duplicates code, but we really want to avoid
 *      extra checks, since ipv6_build_header is used by TCP (which
 *      is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

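/*
 * Deliver a Router Alert packet to every raw socket registered for
 * this RA selector: clone for all listeners but the last, which
 * consumes the original skb.  Returns 1 if anyone took the packet.
 */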
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that applications will be able
         *      to interpret them.  The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
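        /* opt->ra is the offset of the Router Alert option within the
           packet; per RFC 2711 the 16-bit alert value sits in the two
           bytes after the option type and length. */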
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

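/*
 * Propagate per-packet state from the original skb to a fragment:
 * routing, device, priority, traffic-control index and netfilter
 * (conntrack and bridge) context all survive fragmentation.
 */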
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

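/*
 * Walk the extension header chain to find where a fragment header
 * must be inserted: after any hop-by-hop, routing and (when a routing
 * header was seen) destination options headers, which per RFC 2460
 * precede the fragmentable part of the packet.  Returns the offset
 * and points *nexthdr at the nexthdr field that has to be patched.
 */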
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

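/*
 * Fragment an oversized packet.  The unfragmentable part (base header
 * plus the extension headers found above) is replicated in front of
 * every fragment, followed by a fragment header; fragment payloads
 * are multiples of 8 octets except for the last one.
 */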
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

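        /* Fast path: if the packet was built as a chain of correctly
           sized frag_list skbs (e.g. by ip6_append_data), each chain
           element becomes a fragment in place, with no data copying.
           Any geometry or sharing violation falls through to the
           copying slow path below. */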
        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      LL_RESERVED_SPACE(rt->u.dst.dev),
                                      GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

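/*
 * Resolve the route for a flow on behalf of a socket: revalidate any
 * dst cached on the socket against the destination and interface of
 * the flow, fall back to a fresh ip6_route_output(), and pick a
 * source address if the caller left fl6_src unspecified.
 */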
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the not connected
                           case is not very simple.  Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE            --ANK (980726)

                           1. If route was host route, check that
                              cached destination is current.
                              If it is network route, we still may
                              check its validity using saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save whole address now,
                              (because main consumer of this service
                               is tcp, which does not have this problem),
                              so that the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

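/*
 * Queue data for transmission on a corked socket.  The first call per
 * message records the cork state (options, route, hop limit, MTU);
 * later calls append more data.  Payload is packed into skbs that
 * leave room for a fragment header, so ip6_fragment() can later take
 * its no-copy fast path.
 *
 * A sketch of typical datagram usage (the surrounding sendmsg code is
 * assumed, not taken from this file):
 *
 *      err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
 *                            sizeof(struct udphdr), hlimit, opt,
 *                            fl, rt, msg->msg_flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!(msg->msg_flags & MSG_MORE))
 *              err = ip6_push_pending_frames(sk);
 */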
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above --miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
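        /* Largest skb length that still leaves room for a fragment
           header: round the payload space down to a multiple of 8,
           then take the fragment header itself back out.  E.g. with
           mtu = 1500 and fragheaderlen = 40:
           (1460 & ~7) + 40 - 8 = 1456 + 32 = 1488. */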
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;
                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) + frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

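/*
 * Flush the corked queue as one packet: splice the queued skbs into a
 * head skb plus frag_list, prepend extension headers and the IPv6
 * header, and hand the result to the LOCAL_OUT hook.  A payload
 * larger than IPV6_MAXPLEN leaves payload_len at 0, as for a
 * jumbogram.
 */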
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}