2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/* Fill in the IPv6 payload length from skb->len and hand the packet to the
 * NF_INET_LOCAL_OUT netfilter hook.
 * NOTE(review): interior lines are elided in this excerpt (error path for
 * oversized payloads, hook okfn) — do not treat the visible statements as
 * the complete body. */
60 int __ip6_local_out(struct sk_buff *skb)
64 len = skb->len - sizeof(struct ipv6hdr);
65 if (len > IPV6_MAXPLEN)
67 ipv6_hdr(skb)->payload_len = htons(len);
69 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
/* Run the LOCAL_OUT hook and, on success, continue down dst_output().
 * Exported for other kernel modules (GPL-only). */
73 int ip6_local_out(struct sk_buff *skb)
77 err = __ip6_local_out(skb);
79 err = dst_output(skb);
83 EXPORT_SYMBOL_GPL(ip6_local_out);
/* Final transmit step: use the cached hardware header if present,
 * otherwise fall back to the neighbour output function; count a
 * no-route discard if neither is available. */
85 static int ip6_output_finish(struct sk_buff *skb)
87 struct dst_entry *dst = skb_dst(skb);
90 return neigh_hh_output(dst->hh, skb);
91 else if (dst->neighbour)
92 return dst->neighbour->output(skb);
94 IP6_INC_STATS_BH(dev_net(dst->dev),
95 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
101 /* dev_loopback_xmit for use with netfilter. */
/* Loop a multicast copy back to the local stack: rewind to the network
 * header, mark it PACKET_LOOPBACK, and skip checksum verification since
 * the data never left the host. */
102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
104 skb_reset_mac_header(newskb);
105 __skb_pull(newskb, skb_network_offset(newskb));
106 newskb->pkt_type = PACKET_LOOPBACK;
107 newskb->ip_summed = CHECKSUM_UNNECESSARY;
108 WARN_ON(!skb_dst(newskb));
/* Second-stage output: handle multicast loopback/statistics, then pass the
 * packet through NF_INET_POST_ROUTING toward ip6_output_finish.
 * NOTE(review): several interior lines (hop_limit==0 discard path tail,
 * OUTMCAST byte count argument) are elided in this excerpt. */
115 static int ip6_output2(struct sk_buff *skb)
117 struct dst_entry *dst = skb_dst(skb);
118 struct net_device *dev = dst->dev;
120 skb->protocol = htons(ETH_P_IPV6);
123 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
124 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* Loop a copy back to local listeners when the socket wants mcast
 * loopback and either a multicast router socket is interested or a
 * local member of the group exists on this device. */
126 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
127 ((mroute6_socket(dev_net(dev)) &&
128 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
129 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
130 &ipv6_hdr(skb)->saddr))) {
131 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133 /* Do not check for IFF_ALLMULTI; multicast routing
134 is not supported in any case.
137 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139 ip6_dev_loopback_xmit);
/* hop_limit 0 on multicast means the packet must not hit the wire */
141 if (ipv6_hdr(skb)->hop_limit == 0) {
142 IP6_INC_STATS(dev_net(dev), idev,
143 IPSTATS_MIB_OUTDISCARDS);
149 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
153 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/* Effective MTU for this skb's route: when the owning socket probes PMTU
 * itself (IPV6_PMTUDISC_PROBE), use the raw device MTU instead of the
 * (possibly smaller) cached path MTU. */
157 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
162 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
/* dst->output entry point: drop if IPv6 is administratively disabled on the
 * egress device; fragment when the packet exceeds the path MTU (and is not
 * GSO) or the route demands fragmentation of everything; otherwise continue
 * in ip6_output2(). */
165 int ip6_output(struct sk_buff *skb)
167 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
168 if (unlikely(idev->cnf.disable_ipv6)) {
169 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
170 IPSTATS_MIB_OUTDISCARDS);
175 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
176 dst_allfrag(skb_dst(skb)))
177 return ip6_fragment(skb, ip6_output2);
179 return ip6_output2(skb);
183 * xmit an sk_buff (used by TCP)
/* Build the IPv6 header (plus any extension headers from @opt) on an
 * already-filled transport payload and send it via NF_INET_LOCAL_OUT.
 * On an over-MTU packet that may not be fragmented locally, a PKT_TOOBIG
 * ICMP is generated back to the sender.
 * NOTE(review): interior lines are elided (hdr assignment, mtu computation,
 * skb2 swap after headroom realloc) — the visible flow is not complete. */
186 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
187 struct ipv6_txoptions *opt, int ipfragok)
189 struct net *net = sock_net(sk);
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb_dst(skb);
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
201 unsigned int head_room;
203 /* First: exthdrs may take lots of space (~8K for now)
204 MAX_HEADER is not enough.
206 head_room = opt->opt_nflen + opt->opt_flen;
207 seg_len += head_room;
208 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Reallocate headroom when the caller's skb cannot hold the headers */
210 if (skb_headroom(skb) < head_room) {
211 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
214 IPSTATS_MIB_OUTDISCARDS);
221 skb_set_owner_w(skb, sk);
/* Fragmentable opts go innermost; routing/hop-by-hop opts may rewrite
 * the first hop (e.g. routing header) via &first_hop. */
224 ipv6_push_frag_opts(skb, opt, &proto);
226 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
229 skb_push(skb, sizeof(struct ipv6hdr));
230 skb_reset_network_header(skb);
233 /* Allow local fragmentation. */
238 * Fill in the IPv6 header
242 hlimit = np->hop_limit;
245 hlimit = ip6_dst_hoplimit(dst);
/* version 6, traffic class, and flow label packed into the first word */
247 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
249 hdr->payload_len = htons(seg_len);
250 hdr->nexthdr = proto;
251 hdr->hop_limit = hlimit;
253 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
254 ipv6_addr_copy(&hdr->daddr, first_hop);
256 skb->priority = sk->sk_priority;
257 skb->mark = sk->sk_mark;
260 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
261 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
262 IPSTATS_MIB_OUT, skb->len);
263 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
/* Packet too big and local fragmentation not allowed: tell ourselves */
268 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
270 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
271 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
276 EXPORT_SYMBOL(ip6_xmit);
279 * To avoid extra problems ND packets are send through this
280 * routine. It's code duplication but I really want to avoid
281 * extra checks since ipv6_build_header is used by TCP (which
282 * is for us performance critical)
/* Build a minimal IPv6 header for neighbour-discovery packets:
 * fixed traffic class/flow label of zero and addresses supplied
 * directly by the caller. */
285 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
286 const struct in6_addr *saddr, const struct in6_addr *daddr,
289 struct ipv6_pinfo *np = inet6_sk(sk);
293 skb->protocol = htons(ETH_P_IPV6);
296 totlen = len + sizeof(struct ipv6hdr);
298 skb_reset_network_header(skb);
299 skb_put(skb, sizeof(struct ipv6hdr));
/* version 6, tclass 0, flow label 0 */
302 *(__be32*)hdr = htonl(0x60000000);
304 hdr->payload_len = htons(len);
305 hdr->nexthdr = proto;
306 hdr->hop_limit = np->hop_limit;
308 ipv6_addr_copy(&hdr->saddr, saddr);
309 ipv6_addr_copy(&hdr->daddr, daddr);
/* Deliver a Router-Alert packet to every raw socket registered on the
 * ip6_ra_chain with matching selector @sel (honouring SO_BINDTODEVICE).
 * Clones are handed to all but the last matching socket; the original skb
 * goes to the last one, consuming it.
 * NOTE(review): lines updating 'last' inside the loop are elided here. */
314 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 struct ip6_ra_chain *ra;
317 struct sock *last = NULL;
319 read_lock(&ip6_ra_lock);
320 for (ra = ip6_ra_chain; ra; ra = ra->next) {
321 struct sock *sk = ra->sk;
322 if (sk && ra->sel == sel &&
323 (!sk->sk_bound_dev_if ||
324 sk->sk_bound_dev_if == skb->dev->ifindex)) {
326 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
328 rawv6_rcv(last, skb2);
335 rawv6_rcv(last, skb);
336 read_unlock(&ip6_ra_lock);
339 read_unlock(&ip6_ra_lock);
/* Decide how to treat a packet destined to a proxied (pneigh) address:
 * ND messages are passed up to the local stack so proxy-ND can answer,
 * while traffic to link-local proxied addresses is rejected with a
 * link failure. Return value conventions (pass-up / forward / drop)
 * are elided from this excerpt. */
343 static int ip6_forward_proxy_check(struct sk_buff *skb)
345 struct ipv6hdr *hdr = ipv6_hdr(skb);
346 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the real transport protocol */
349 if (ipv6_ext_hdr(nexthdr)) {
350 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
354 offset = sizeof(struct ipv6hdr);
356 if (nexthdr == IPPROTO_ICMPV6) {
357 struct icmp6hdr *icmp6;
/* Need at least the ICMPv6 type byte in the linear area */
359 if (!pskb_may_pull(skb, (skb_network_header(skb) +
360 offset + 1 - skb->data)))
363 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
365 switch (icmp6->icmp6_type) {
366 case NDISC_ROUTER_SOLICITATION:
367 case NDISC_ROUTER_ADVERTISEMENT:
368 case NDISC_NEIGHBOUR_SOLICITATION:
369 case NDISC_NEIGHBOUR_ADVERTISEMENT:
371 /* For reaction involving unicast neighbor discovery
372 * message destined to the proxied address, pass it to
382 * The proxying router can't forward traffic sent to a link-local
383 * address, so signal the sender and discard the packet. This
384 * behavior is clarified by the MIPv6 specification.
386 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
387 dst_link_failure(skb);
/* NF_INET_FORWARD okfn: hand the forwarded packet to the route's output. */
394 static inline int ip6_forward_finish(struct sk_buff *skb)
396 return dst_output(skb);
/* Forward a received IPv6 packet: policy checks, Router-Alert delivery,
 * hop-limit decrement, proxy-ND handling, redirect generation, MTU check,
 * then the NF_INET_FORWARD hook toward ip6_forward_finish.
 * NOTE(review): many interior lines (goto labels, skb->dev assignments,
 * kfree_skb calls, hop_limit decrement) are elided in this excerpt. */
399 int ip6_forward(struct sk_buff *skb)
401 struct dst_entry *dst = skb_dst(skb);
402 struct ipv6hdr *hdr = ipv6_hdr(skb);
403 struct inet6_skb_parm *opt = IP6CB(skb);
404 struct net *net = dev_net(dst->dev);
406 if (net->ipv6.devconf_all->forwarding == 0)
409 if (skb_warn_if_lro(skb))
412 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
413 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
417 skb_forward_csum(skb);
420 * We DO NOT make any processing on
421 * RA packets, pushing them to user level AS IS
422 * without ane WARRANTY that application will be able
423 * to interpret them. The reason is that we
424 * cannot make anything clever here.
426 * We are not end-node, so that if packet contains
427 * AH/ESP, we cannot make anything.
428 * Defragmentation also would be mistake, RA packets
429 * cannot be fragmented, because there is no warranty
430 * that different fragments will go along one path. --ANK
/* Router-Alert option present: deliver to registered RA listeners */
433 u8 *ptr = skb_network_header(skb) + opt->ra;
434 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
439 * check and decrement ttl
441 if (hdr->hop_limit <= 1) {
442 /* Force OUTPUT device used as source address */
444 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
445 IP6_INC_STATS_BH(net,
446 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
452 /* XXX: idev->cnf.proxy_ndp? */
453 if (net->ipv6.devconf_all->proxy_ndp &&
454 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
455 int proxied = ip6_forward_proxy_check(skb);
457 return ip6_input(skb);
458 else if (proxied < 0) {
459 IP6_INC_STATS(net, ip6_dst_idev(dst),
460 IPSTATS_MIB_INDISCARDS);
465 if (!xfrm6_route_forward(skb)) {
466 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
473 We don't send redirects to frames decapsulated from IPsec.
475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
476 !skb_sec_path(skb)) {
477 struct in6_addr *target = NULL;
479 struct neighbour *n = dst->neighbour;
482 * incoming and outgoing devices are the same
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
490 target = &hdr->daddr;
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
498 int addrtype = ipv6_addr_type(&hdr->saddr);
500 /* This check is security critical. */
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
506 ICMPV6_NOT_NEIGHBOUR, 0);
/* Routers never fragment forwarded packets: signal PMTU instead */
511 if (skb->len > dst_mtu(dst)) {
512 /* Again, force OUTPUT device used as source address */
514 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst));
515 IP6_INC_STATS_BH(net,
516 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
517 IP6_INC_STATS_BH(net,
518 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
523 if (skb_cow(skb, dst->dev->hard_header_len)) {
524 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
530 /* Mangling hops number delayed to point after skb COW */
534 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
535 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
539 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/* Copy per-packet metadata (type, priority, protocol, dst reference, mark,
 * tc index, nf trace flag, secmark) from the original skb to a fragment. */
545 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
547 to->pkt_type = from->pkt_type;
548 to->priority = from->priority;
549 to->protocol = from->protocol;
551 skb_dst_set(to, dst_clone(skb_dst(from)));
553 to->mark = from->mark;
555 #ifdef CONFIG_NET_SCHED
556 to->tc_index = from->tc_index;
559 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
560 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
561 to->nf_trace = from->nf_trace;
563 skb_copy_secmark(to, from);
/* Walk the extension-header chain to find where the Fragment header must
 * be inserted; returns the offset of the unfragmentable part's end and
 * leaves *nexthdr pointing at the nexthdr byte to patch.
 * NOTE(review): the switch body and several case labels are elided here. */
566 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
568 u16 offset = sizeof(struct ipv6hdr);
569 struct ipv6_opt_hdr *exthdr =
570 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
571 unsigned int packet_len = skb->tail - skb->network_header;
573 *nexthdr = &ipv6_hdr(skb)->nexthdr;
575 while (offset + 1 <= packet_len) {
581 case NEXTHDR_ROUTING:
585 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* A Home Address option forces fragmentation after dstopts */
586 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
596 offset += ipv6_optlen(exthdr);
597 *nexthdr = &exthdr->nexthdr;
598 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/* Fragment an oversized locally-generated IPv6 packet and push each
 * fragment through @output. Two strategies:
 *   1. Fast path — if the skb already carries a well-formed frag_list,
 *      convert each list member into a fragment in place.
 *   2. Slow path — allocate fresh skbs and copy data block by block.
 * NOTE(review): a large number of interior lines (loop heads, error
 * labels, frag_off offset masking, per-fragment output calls) are elided
 * in this excerpt; the visible statements do not form the full body. */
605 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
607 struct sk_buff *frag;
608 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
609 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
610 struct ipv6hdr *tmp_hdr;
612 unsigned int mtu, hlen, left, len;
614 int ptr, offset = 0, err=0;
615 u8 *prevhdr, nexthdr = 0;
616 struct net *net = dev_net(skb_dst(skb)->dev);
618 hlen = ip6_find_1stfragopt(skb, &prevhdr);
621 mtu = ip6_skb_dst_mtu(skb);
623 /* We must not fragment if the socket is set to force MTU discovery
624 * or if the skb it not generated by a local socket. (This last
625 * check should be redundant, but it's free.)
627 if (!skb->local_df) {
628 skb->dev = skb_dst(skb)->dev;
629 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
631 IPSTATS_MIB_FRAGFAILS);
636 if (np && np->frag_size < mtu) {
/* Reserve room for the unfragmentable part plus the Fragment header */
640 mtu -= hlen + sizeof(struct frag_hdr);
642 if (skb_has_frags(skb)) {
643 int first_len = skb_pagelen(skb);
/* The fast path requires every piece to fit the MTU, all but the
 * last to be 8-byte aligned, and enough headroom on each. */
646 if (first_len - hlen > mtu ||
647 ((first_len - hlen) & 7) ||
651 skb_walk_frags(skb, frag) {
652 /* Correct geometry. */
653 if (frag->len > mtu ||
654 ((frag->len & 7) && frag->next) ||
655 skb_headroom(frag) < hlen)
658 /* Partially cloned skb? */
659 if (skb_shared(frag))
/* Transfer socket accounting from the list members to the head */
665 frag->destructor = sock_wfree;
666 truesizes += frag->truesize;
672 frag = skb_shinfo(skb)->frag_list;
673 skb_frag_list_init(skb);
676 *prevhdr = NEXTHDR_FRAGMENT;
677 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
680 IPSTATS_MIB_FRAGFAILS);
/* Open a gap for the Fragment header and restore the copied
 * unfragmentable headers in front of it */
684 __skb_pull(skb, hlen);
685 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
686 __skb_push(skb, hlen);
687 skb_reset_network_header(skb);
688 memcpy(skb_network_header(skb), tmp_hdr, hlen);
690 ipv6_select_ident(fh);
691 fh->nexthdr = nexthdr;
693 fh->frag_off = htons(IP6_MF);
694 frag_id = fh->identification;
696 first_len = skb_pagelen(skb);
697 skb->data_len = first_len - skb_headlen(skb);
698 skb->truesize -= truesizes;
699 skb->len = first_len;
700 ipv6_hdr(skb)->payload_len = htons(first_len -
701 sizeof(struct ipv6hdr));
703 dst_hold(&rt->u.dst);
706 /* Prepare header of the next frame,
707 * before previous one went down. */
709 frag->ip_summed = CHECKSUM_NONE;
710 skb_reset_transport_header(frag);
711 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
712 __skb_push(frag, hlen);
713 skb_reset_network_header(frag);
714 memcpy(skb_network_header(frag), tmp_hdr,
716 offset += skb->len - hlen - sizeof(struct frag_hdr);
717 fh->nexthdr = nexthdr;
719 fh->frag_off = htons(offset);
720 if (frag->next != NULL)
721 fh->frag_off |= htons(IP6_MF);
722 fh->identification = frag_id;
723 ipv6_hdr(frag)->payload_len =
725 sizeof(struct ipv6hdr));
726 ip6_copy_metadata(frag, skb);
731 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
732 IPSTATS_MIB_FRAGCREATES);
745 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
746 IPSTATS_MIB_FRAGOKS);
747 dst_release(&rt->u.dst);
757 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
758 IPSTATS_MIB_FRAGFAILS);
759 dst_release(&rt->u.dst);
/* ---- Slow path: copy-based fragmentation ---- */
764 left = skb->len - hlen; /* Space per frame */
765 ptr = hlen; /* Where to start from */
768 * Fragment the datagram.
771 *prevhdr = NEXTHDR_FRAGMENT;
774 * Keep copying data until we run out.
778 /* IF: it doesn't fit, use 'mtu' - the data space left */
781 /* IF: we are not sending upto and including the packet end
782 then align the next start on an eight byte boundary */
790 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
791 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
792 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
793 IPSTATS_MIB_FRAGFAILS);
799 * Set up data on packet
802 ip6_copy_metadata(frag, skb);
803 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
804 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
805 skb_reset_network_header(frag);
806 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
807 frag->transport_header = (frag->network_header + hlen +
808 sizeof(struct frag_hdr));
811 * Charge the memory for the fragment to any owner
815 skb_set_owner_w(frag, skb->sk);
818 * Copy the packet header into the new buffer.
820 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
823 * Build fragment header.
825 fh->nexthdr = nexthdr;
/* First fragment picks the identification; the rest reuse it */
828 ipv6_select_ident(fh);
829 frag_id = fh->identification;
831 fh->identification = frag_id;
834 * Copy a block of the IP datagram.
836 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
840 fh->frag_off = htons(offset);
842 fh->frag_off |= htons(IP6_MF);
843 ipv6_hdr(frag)->payload_len = htons(frag->len -
844 sizeof(struct ipv6hdr));
850 * Put this fragment into the sending queue.
856 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
857 IPSTATS_MIB_FRAGCREATES);
859 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860 IPSTATS_MIB_FRAGOKS);
865 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
866 IPSTATS_MIB_FRAGFAILS);
/* Return nonzero when the cached route key no longer matches the flow
 * address: neither an exact /128 host-route match nor a match against the
 * socket's cached last-used address. */
871 static inline int ip6_rt_check(struct rt6key *rt_key,
872 struct in6_addr *fl_addr,
873 struct in6_addr *addr_cache)
875 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
876 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/* Validate a socket-cached dst against the current flow; release it and
 * return NULL when destination, (optionally) source subtree, or output
 * interface no longer match, so the caller performs a fresh lookup. */
879 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
880 struct dst_entry *dst,
883 struct ipv6_pinfo *np = inet6_sk(sk);
884 struct rt6_info *rt = (struct rt6_info *)dst;
889 /* Yes, checking route validity in not connected
890 * case is not very simple. Take into account,
891 * that we do not support routing by source, TOS,
892 * and MSG_DONTROUTE --ANK (980726)
894 * 1. ip6_rt_check(): If route was host route,
895 * check that cached destination is current.
896 * If it is network route, we still may
897 * check its validity using saved pointer
898 * to the last used address: daddr_cache.
899 * We do not want to save whole address now,
900 * (because main consumer of this service
901 * is tcp, which has not this problem),
902 * so that the last trick works only on connected
904 * 2. oif also should be the same.
906 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
907 #ifdef CONFIG_IPV6_SUBTREES
908 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910 (fl->oif && fl->oif != dst->dev->ifindex)) {
/* Core of the route lookup: resolve the flow to a dst, select a source
 * address if the flow has none, and (with optimistic DAD) fall back to the
 * default router's dst when the next hop's neighbour entry is not yet valid
 * for an OPTIMISTIC source address.
 * NOTE(review): cleanup labels and ifp release are elided in this excerpt. */
919 static int ip6_dst_lookup_tail(struct sock *sk,
920 struct dst_entry **dst, struct flowi *fl)
923 struct net *net = sock_net(sk);
926 *dst = ip6_route_output(net, sk, fl);
928 if ((err = (*dst)->error))
929 goto out_err_release;
931 if (ipv6_addr_any(&fl->fl6_src)) {
932 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
934 sk ? inet6_sk(sk)->srcprefs : 0,
937 goto out_err_release;
940 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942 * Here if the dst entry we've looked up
943 * has a neighbour entry that is in the INCOMPLETE
944 * state and the src address from the flow is
945 * marked as OPTIMISTIC, we release the found
946 * dst entry and replace it instead with the
947 * dst entry of the nexthop router
949 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
950 struct inet6_ifaddr *ifp;
954 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
957 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
963 * We need to get the dst entry for the
964 * default router instead
967 memcpy(&fl_gw, fl, sizeof(struct flowi));
968 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
969 *dst = ip6_route_output(net, sk, &fl_gw);
970 if ((err = (*dst)->error))
971 goto out_err_release;
979 if (err == -ENETUNREACH)
980 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
987 * ip6_dst_lookup - perform route lookup on flow
988 * @sk: socket which provides route info
989 * @dst: pointer to dst_entry * for result
990 * @fl: flow to lookup
992 * This function performs a route lookup on the given flow.
994 * It returns zero on success, or a standard errno code on error.
996 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
999 return ip6_dst_lookup_tail(sk, dst, fl);
1001 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1004 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1005 * @sk: socket which provides the dst cache and route info
1006 * @dst: pointer to dst_entry * for result
1007 * @fl: flow to lookup
1009 * This function performs a route lookup on the given flow with the
1010 * possibility of using the cached route in the socket if it is valid.
1011 * It will take the socket dst lock when operating on the dst cache.
1012 * As a result, this function can only be used in process context.
1014 * It returns zero on success, or a standard errno code on error.
1016 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the socket's cached dst first; fall back to a full lookup */
1020 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1021 *dst = ip6_sk_dst_check(sk, *dst, fl);
1024 return ip6_dst_lookup_tail(sk, dst, fl);
1026 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/* UDP Fragmentation Offload path for ip6_append_data: build one large skb,
 * let getfrag fill page fragments, and mark it SKB_GSO_UDP with a gso_size
 * that keeps each wire fragment a multiple of 8 bytes so the device (or
 * software GSO) can split it later.
 * NOTE(review): error-return lines after sock_alloc_send_skb and the
 * function tail are elided in this excerpt. */
1028 static inline int ip6_ufo_append_data(struct sock *sk,
1029 int getfrag(void *from, char *to, int offset, int len,
1030 int odd, struct sk_buff *skb),
1031 void *from, int length, int hh_len, int fragheaderlen,
1032 int transhdrlen, int mtu,unsigned int flags)
1035 struct sk_buff *skb;
1038 /* There is support for UDP large send offload by network
1039 * device, so create one single skb packet containing complete
1042 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1043 skb = sock_alloc_send_skb(sk,
1044 hh_len + fragheaderlen + transhdrlen + 20,
1045 (flags & MSG_DONTWAIT), &err);
1049 /* reserve space for Hardware header */
1050 skb_reserve(skb, hh_len);
1052 /* create space for UDP/IP header */
1053 skb_put(skb,fragheaderlen + transhdrlen);
1055 /* initialize network header pointer */
1056 skb_reset_network_header(skb);
1058 /* initialize protocol header pointer */
1059 skb->transport_header = skb->network_header + fragheaderlen;
1061 skb->ip_summed = CHECKSUM_PARTIAL;
1063 sk->sk_sndmsg_off = 0;
1066 err = skb_append_datato_frags(sk,skb, getfrag, from,
1067 (length - transhdrlen));
1069 struct frag_hdr fhdr;
1071 /* Specify the length of each IPv6 datagram fragment.
1072 * It has to be a multiple of 8.
1074 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1075 sizeof(struct frag_hdr)) & ~7;
1076 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* Pick the fragment ID now so all GSO-produced fragments share it */
1077 ipv6_select_ident(&fhdr);
1078 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1079 __skb_queue_tail(&sk->sk_write_queue, skb);
1083 /* There is not enough support do UPD LSO,
1084 * so follow normal path
/* Duplicate an extension-header option block; length in 8-octet units is
 * (hdrlen + 1) per RFC 2460 option-header encoding. NULL-safe. */
1091 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1094 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Duplicate a routing header; same (hdrlen + 1) * 8 sizing rule. NULL-safe. */
1097 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1100 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Append user data to the socket's corked write queue, building MTU-sized
 * skbs that ip6_push_pending_frames() will later finalize. On the first
 * call for a cork cycle, option headers are duplicated into np->cork and
 * the route/flow/mtu are cached in inet->cork.
 * NOTE(review): many interior lines (goto error labels, loop advance of
 * offset/length, page-copy memcpy, several closing braces) are elided in
 * this excerpt — the visible statements are not the complete body. */
1103 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1104 int offset, int len, int odd, struct sk_buff *skb),
1105 void *from, int length, int transhdrlen,
1106 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1107 struct rt6_info *rt, unsigned int flags)
1109 struct inet_sock *inet = inet_sk(sk);
1110 struct ipv6_pinfo *np = inet6_sk(sk);
1111 struct sk_buff *skb;
1112 unsigned int maxfraglen, fragheaderlen;
1119 int csummode = CHECKSUM_NONE;
1121 if (flags&MSG_PROBE)
/* First append of a cork cycle: set up cork state */
1123 if (skb_queue_empty(&sk->sk_write_queue)) {
1128 if (WARN_ON(np->cork.opt))
1131 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1132 if (unlikely(np->cork.opt == NULL))
1135 np->cork.opt->tot_len = opt->tot_len;
1136 np->cork.opt->opt_flen = opt->opt_flen;
1137 np->cork.opt->opt_nflen = opt->opt_nflen;
1139 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141 if (opt->dst0opt && !np->cork.opt->dst0opt)
1144 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146 if (opt->dst1opt && !np->cork.opt->dst1opt)
1149 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151 if (opt->hopopt && !np->cork.opt->hopopt)
1154 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156 if (opt->srcrt && !np->cork.opt->srcrt)
1159 /* need source address above miyazawa*/
1161 dst_hold(&rt->u.dst);
1162 inet->cork.dst = &rt->u.dst;
1163 inet->cork.fl = *fl;
1164 np->cork.hop_limit = hlimit;
1165 np->cork.tclass = tclass;
1166 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1167 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1168 if (np->frag_size < mtu) {
1170 mtu = np->frag_size;
1172 inet->cork.fragsize = mtu;
1173 if (dst_allfrag(rt->u.dst.path))
1174 inet->cork.flags |= IPCORK_ALLFRAG;
1175 inet->cork.length = 0;
1176 sk->sk_sndmsg_page = NULL;
1177 sk->sk_sndmsg_off = 0;
1178 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1179 rt->rt6i_nfheader_len;
1180 length += exthdrlen;
1181 transhdrlen += exthdrlen;
/* Subsequent appends reuse the cached cork route and flow */
1183 rt = (struct rt6_info *)inet->cork.dst;
1184 fl = &inet->cork.fl;
1188 mtu = inet->cork.fragsize;
1191 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1193 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1194 (opt ? opt->opt_nflen : 0);
1195 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Reject a total datagram larger than the IPv6 maximum payload */
1197 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1198 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1199 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1205 * Let's try using as much space as possible.
1206 * Use MTU if total length of the message fits into the MTU.
1207 * Otherwise, we need to reserve fragment header and
1208 * fragment alignment (= 8-15 octects, in total).
1210 * Note that we may need to "move" the data from the tail of
1211 * of the buffer to the new fragment when we split
1214 * FIXME: It may be fragmented into multiple chunks
1215 * at once if non-fragmentable extension headers
1220 inet->cork.length += length;
/* Large UDP with UFO-capable device: take the offload path */
1221 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1222 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1224 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1225 fragheaderlen, transhdrlen, mtu,
1232 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1235 while (length > 0) {
1236 /* Check if the remaining data fits into current packet. */
1237 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1239 copy = maxfraglen - skb->len;
1243 unsigned int datalen;
1244 unsigned int fraglen;
1245 unsigned int fraggap;
1246 unsigned int alloclen;
1247 struct sk_buff *skb_prev;
1251 /* There's no room in the current skb */
1253 fraggap = skb_prev->len - maxfraglen;
1258 * If remaining data exceeds the mtu,
1259 * we know we need more fragment(s).
1261 datalen = length + fraggap;
1262 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1263 datalen = maxfraglen - fragheaderlen;
1265 fraglen = datalen + fragheaderlen;
1266 if ((flags & MSG_MORE) &&
1267 !(rt->u.dst.dev->features&NETIF_F_SG))
1270 alloclen = datalen + fragheaderlen;
1273 * The last fragment gets additional space at tail.
1274 * Note: we overallocate on fragments with MSG_MODE
1275 * because we have no idea if we're the last one.
1277 if (datalen == length + fraggap)
1278 alloclen += rt->u.dst.trailer_len;
1281 * We just reserve space for fragment header.
1282 * Note: this may be overallocation if the message
1283 * (without MSG_MORE) fits into the MTU.
1285 alloclen += sizeof(struct frag_hdr);
1288 skb = sock_alloc_send_skb(sk,
1290 (flags & MSG_DONTWAIT), &err);
1293 if (atomic_read(&sk->sk_wmem_alloc) <=
1295 skb = sock_wmalloc(sk,
1296 alloclen + hh_len, 1,
1298 if (unlikely(skb == NULL))
1304 * Fill in the control structures
1306 skb->ip_summed = csummode;
1308 /* reserve for fragmentation */
1309 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1312 * Find where to start putting bytes
1314 data = skb_put(skb, fraglen);
1315 skb_set_network_header(skb, exthdrlen);
1316 data += fragheaderlen;
1317 skb->transport_header = (skb->network_header +
/* Move trailing, non-8-aligned data from the previous skb into
 * this one, adjusting the previous skb's checksum */
1320 skb->csum = skb_copy_and_csum_bits(
1321 skb_prev, maxfraglen,
1322 data + transhdrlen, fraggap, 0);
1323 skb_prev->csum = csum_sub(skb_prev->csum,
1326 pskb_trim_unique(skb_prev, maxfraglen);
1328 copy = datalen - transhdrlen - fraggap;
1333 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340 length -= datalen - fraggap;
/* Only the first skb of the message can use HW checksum state */
1343 csummode = CHECKSUM_NONE;
1346 * Put the packet on the pending queue
1348 __skb_queue_tail(&sk->sk_write_queue, skb);
/* Non-SG device: copy into the linear area of the tail skb */
1355 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1359 if (getfrag(from, skb_put(skb, copy),
1360 offset, copy, off, skb) < 0) {
1361 __skb_trim(skb, off);
/* SG device: append into page fragments, reusing the socket's
 * current send page when it still has room */
1366 int i = skb_shinfo(skb)->nr_frags;
1367 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1368 struct page *page = sk->sk_sndmsg_page;
1369 int off = sk->sk_sndmsg_off;
1372 if (page && (left = PAGE_SIZE - off) > 0) {
1375 if (page != frag->page) {
1376 if (i == MAX_SKB_FRAGS) {
1381 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1382 frag = &skb_shinfo(skb)->frags[i];
1384 } else if(i < MAX_SKB_FRAGS) {
1385 if (copy > PAGE_SIZE)
1387 page = alloc_pages(sk->sk_allocation, 0);
1392 sk->sk_sndmsg_page = page;
1393 sk->sk_sndmsg_off = 0;
1395 skb_fill_page_desc(skb, i, page, 0, 0);
1396 frag = &skb_shinfo(skb)->frags[i];
1401 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1405 sk->sk_sndmsg_off += copy;
1408 skb->data_len += copy;
1409 skb->truesize += copy;
1410 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error path: undo the optimistic cork.length increase */
1417 inet->cork.length -= length;
1418 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/* Free all cork state: duplicated option headers, the held route
 * reference, the ALLFRAG flag, and the cached flow. */
1422 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1425 kfree(np->cork.opt->dst0opt);
1426 kfree(np->cork.opt->dst1opt);
1427 kfree(np->cork.opt->hopopt);
1428 kfree(np->cork.opt->srcrt);
1429 kfree(np->cork.opt);
1430 np->cork.opt = NULL;
1433 if (inet->cork.dst) {
1434 dst_release(inet->cork.dst);
1435 inet->cork.dst = NULL;
1436 inet->cork.flags &= ~IPCORK_ALLFRAG;
1438 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/* Finalize the corked write queue: collapse queued skbs into one packet
 * via its frag_list, push extension headers and the IPv6 header, update
 * statistics, and transmit with ip6_local_out().
 * NOTE(review): some interior lines (payload_len assignment, error label
 * placement) are elided in this excerpt. */
1441 int ip6_push_pending_frames(struct sock *sk)
1443 struct sk_buff *skb, *tmp_skb;
1444 struct sk_buff **tail_skb;
1445 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1446 struct inet_sock *inet = inet_sk(sk);
1447 struct ipv6_pinfo *np = inet6_sk(sk);
1448 struct net *net = sock_net(sk);
1449 struct ipv6hdr *hdr;
1450 struct ipv6_txoptions *opt = np->cork.opt;
1451 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1452 struct flowi *fl = &inet->cork.fl;
1453 unsigned char proto = fl->proto;
1456 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1458 tail_skb = &(skb_shinfo(skb)->frag_list);
1460 /* move skb->data to ip header from ext header */
1461 if (skb->data < skb_network_header(skb))
1462 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head's frag_list and fold
 * their lengths/truesize into the head */
1463 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1464 __skb_pull(tmp_skb, skb_network_header_len(skb));
1465 *tail_skb = tmp_skb;
1466 tail_skb = &(tmp_skb->next);
1467 skb->len += tmp_skb->len;
1468 skb->data_len += tmp_skb->len;
1469 skb->truesize += tmp_skb->truesize;
1470 tmp_skb->destructor = NULL;
1474 /* Allow local fragmentation. */
1475 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* A routing header may rewrite the on-wire destination */
1478 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1479 __skb_pull(skb, skb_network_header_len(skb));
1480 if (opt && opt->opt_flen)
1481 ipv6_push_frag_opts(skb, opt, &proto);
1482 if (opt && opt->opt_nflen)
1483 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1485 skb_push(skb, sizeof(struct ipv6hdr));
1486 skb_reset_network_header(skb);
1487 hdr = ipv6_hdr(skb);
1489 *(__be32*)hdr = fl->fl6_flowlabel |
1490 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1492 hdr->hop_limit = np->cork.hop_limit;
1493 hdr->nexthdr = proto;
1494 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1495 ipv6_addr_copy(&hdr->daddr, final_dst);
1497 skb->priority = sk->sk_priority;
1498 skb->mark = sk->sk_mark;
1500 skb_dst_set(skb, dst_clone(&rt->u.dst));
1501 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1502 if (proto == IPPROTO_ICMPV6) {
1503 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1505 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1506 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1509 err = ip6_local_out(skb);
1512 err = net_xmit_errno(err);
1518 ip6_cork_release(inet, np);
1521 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/* Abort a cork cycle: drop every queued skb (counting discards) and
 * release all cork state. */
1525 void ip6_flush_pending_frames(struct sock *sk)
1527 struct sk_buff *skb;
1529 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1531 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1532 IPSTATS_MIB_OUTDISCARDS);
1536 ip6_cork_release(inet_sk(sk), inet6_sk(sk));