2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
37 #include <net/protocol.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
58 1. The most important issue is detecting local dead loops.
59 They would cause complete host lockup in transmit, which
60 would be "resolved" by stack overflow or, if queueing is enabled,
61 with infinite looping in net_bh.
63 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best
67    solution, but it supposes maintaining new variable in ALL
68 skb, even if no tunneling is used.
70 Current solution: HARD_TX_LOCK lock breaks dead loops.
74 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case,
76 if we copy it from packet being encapsulated to upper header.
77 It is very good solution, but it introduces two problems:
79 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80 do not work over tunnels.
81 - traceroute does not work. I planned to relay ICMP from tunnel,
82 so that this problem would be solved and traceroute output
83 would even more informative. This idea appeared to be wrong:
84 only Linux complies to rfc1812 now (yes, guys, Linux is the only
85 true router now :-)), all routers (at least, in neighbourhood of mine)
86 return only 8 bytes of payload. It is the end.
88 Hence, if we want that OSPF worked or traceroute said something reasonable,
89 we should search for another solution.
91 One of them is to parse packet trying to detect inner encapsulation
92 made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
95 Current solution: The solution was UNEXPECTEDLY SIMPLE.
96 We force DF flag on tunnels with preconfigured hop limit,
97 that is ALL. :-) Well, it does not remove the problem completely,
98 but exponential growth of network traffic is changed to linear
99 (branches, that exceed pmtu are pruned) and tunnel mtu
100 fastly degrades to value <68, where looping stops.
101 Yes, it is not good if there exists a router in the loop,
102 which does not force DF, even when encapsulating packets have DF set.
103 But it is not our problem! Nobody could accuse us, we made
104 all that we could make. Even if it is your gated who injected
105 fatal route to network, even if it were you who configured
106 fatal static route: you are innocent. :-)
110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111 practically identical code. It would be good to glue them
112 together, but it is not very evident, how to make them modular.
113 sit is integral part of IPv6, ipip and gre are naturally modular.
114 We could extract common parts (hash table, ioctl etc)
115 to a separate module (ip_tunnel.c).
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
122 static void ipgre_tunnel_setup(struct net_device *dev);
123 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125 /* Fallback tunnel: no source, no destination, no key, no options */
129 static int ipgre_net_id __read_mostly;
131 struct ip_tunnel *tunnels[4][HASH_SIZE];
133 struct net_device *fb_tunnel_dev;
136 /* Tunnel hash table */
146 We require exact key match i.e. if a key is present in packet
147 it will match only tunnel with the same key; if it is not present,
148 it will match only keyless tunnel.
150    All keyless packets, if not matched configured keyless tunnels
151 will match fallback tunnel.
154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
156 #define tunnels_r_l tunnels[3]
157 #define tunnels_r tunnels[2]
158 #define tunnels_l tunnels[1]
159 #define tunnels_wc tunnels[0]
161 * Locking : hash tables are protected by RCU and a spinlock
163 static DEFINE_SPINLOCK(ipgre_lock);
165 #define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
168 /* Given src, dst and key, find appropriate for input tunnel. */
170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
171 __be32 remote, __be32 local,
172 __be32 key, __be16 gre_proto)
174 struct net *net = dev_net(dev);
175 int link = dev->ifindex;
176 unsigned h0 = HASH(remote);
177 unsigned h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181 ARPHRD_ETHER : ARPHRD_IPGRE;
182 int score, cand_score = 4;
184 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
185 if (local != t->parms.iph.saddr ||
186 remote != t->parms.iph.daddr ||
187 key != t->parms.i_key ||
188 !(t->dev->flags & IFF_UP))
191 if (t->dev->type != ARPHRD_IPGRE &&
192 t->dev->type != dev_type)
196 if (t->parms.link != link)
198 if (t->dev->type != dev_type)
203 if (score < cand_score) {
209 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
210 if (remote != t->parms.iph.daddr ||
211 key != t->parms.i_key ||
212 !(t->dev->flags & IFF_UP))
215 if (t->dev->type != ARPHRD_IPGRE &&
216 t->dev->type != dev_type)
220 if (t->parms.link != link)
222 if (t->dev->type != dev_type)
227 if (score < cand_score) {
233 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
234 if ((local != t->parms.iph.saddr &&
235 (local != t->parms.iph.daddr ||
236 !ipv4_is_multicast(local))) ||
237 key != t->parms.i_key ||
238 !(t->dev->flags & IFF_UP))
241 if (t->dev->type != ARPHRD_IPGRE &&
242 t->dev->type != dev_type)
246 if (t->parms.link != link)
248 if (t->dev->type != dev_type)
253 if (score < cand_score) {
259 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
260 if (t->parms.i_key != key ||
261 !(t->dev->flags & IFF_UP))
264 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type)
269 if (t->parms.link != link)
271 if (t->dev->type != dev_type)
276 if (score < cand_score) {
285 dev = ign->fb_tunnel_dev;
286 if (dev->flags & IFF_UP)
287 return netdev_priv(dev);
/*
 * __ipgre_bucket - return the hash-chain head for a tunnel's parameters.
 * Chain priority (prio) is derived from which of remote/local are set;
 * the derivation lines are missing from this extract — TODO confirm
 * against the full source.  Hash is over the i_key.
 */
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293 					 struct ip_tunnel_parm *parms)
295 	__be32 remote = parms->iph.daddr;
296 	__be32 local = parms->iph.saddr;
297 	__be32 key = parms->i_key;
298 	unsigned h = HASH(key);
/* Unicast remote addresses select the higher-priority chains. */
303 	if (remote && !ipv4_is_multicast(remote)) {
308 	return &ign->tunnels[prio][h];
/* ipgre_bucket - convenience wrapper: bucket for an existing tunnel. */
311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
314 	return __ipgre_bucket(ign, &t->parms);
/*
 * ipgre_tunnel_link - insert tunnel t at the head of its hash chain.
 * Writers serialize on ipgre_lock; readers walk the chain under RCU,
 * hence the rcu_assign_pointer() publish.
 */
317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
319 	struct ip_tunnel **tp = ipgre_bucket(ign, t);
321 	spin_lock_bh(&ipgre_lock);
323 	rcu_assign_pointer(*tp, t);
324 	spin_unlock_bh(&ipgre_lock);
/*
 * ipgre_tunnel_unlink - remove tunnel t from its hash chain.
 * Walks the chain to find the link pointing at t; the actual unlink
 * statement between the lock/unlock is missing from this extract.
 */
327 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
329 	struct ip_tunnel **tp;
331 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
333 	spin_lock_bh(&ipgre_lock);
335 	spin_unlock_bh(&ipgre_lock);
/*
 * ipgre_tunnel_find - exact-match lookup by configuration.
 * Unlike ipgre_tunnel_lookup() (packet receive path, scored), this
 * requires saddr, daddr, key, link AND device type to all match, and
 * is used when creating/changing tunnels to detect duplicates.
 */
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342 					   struct ip_tunnel_parm *parms,
345 	__be32 remote = parms->iph.daddr;
346 	__be32 local = parms->iph.saddr;
347 	__be32 key = parms->i_key;
348 	int link = parms->link;
349 	struct ip_tunnel *t, **tp;
350 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
352 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353 		if (local == t->parms.iph.saddr &&
354 		    remote == t->parms.iph.daddr &&
355 		    key == t->parms.i_key &&
356 		    link == t->parms.link &&
357 		    type == t->dev->type)
/*
 * ipgre_tunnel_locate - find a tunnel matching parms, optionally
 * creating it (create != 0): allocates a netdev ("gre%d" if unnamed),
 * registers it and links it into the hash table.
 *
 * NOTE(review): error-unwind lines (free_netdev on failure etc.) are
 * missing from this extract.
 */
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364 		struct ip_tunnel_parm *parms, int create)
366 	struct ip_tunnel *t, *nt;
367 	struct net_device *dev;
369 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
371 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
376 		strlcpy(name, parms->name, IFNAMSIZ);
378 		sprintf(name, "gre%%d");
380 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
384 	dev_net_set(dev, net);
/* '%' in the name means a template: let the core pick a free index. */
386 	if (strchr(name, '%')) {
387 		if (dev_alloc_name(dev, name) < 0)
391 	nt = netdev_priv(dev);
393 	dev->rtnl_link_ops = &ipgre_link_ops;
395 	dev->mtu = ipgre_tunnel_bind_dev(dev);
397 	if (register_netdevice(dev) < 0)
401 	ipgre_tunnel_link(ign, nt);
/* ipgre_tunnel_uninit - ndo_uninit: drop the device from the hash table. */
409 static void ipgre_tunnel_uninit(struct net_device *dev)
411 	struct net *net = dev_net(dev);
412 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
414 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
/*
 * ipgre_err - ICMP error handler for GRE: relay relevant errors
 * (unreachable, frag-needed handled by core, TTL exceeded) to the
 * matching tunnel, rate-limited via err_time/IPTUNNEL_ERR_TIMEO.
 *
 * NOTE(review): flag parsing and several switch arms are missing from
 * this extract.
 */
419 static void ipgre_err(struct sk_buff *skb, u32 info)
422 /* All the routers (except for Linux) return only
423    8 bytes of packet payload. It means, that precise relaying of
424    ICMP in the real Internet is absolutely infeasible.
426    Moreover, Cisco "wise men" put GRE key to the third word
427    in GRE header. It makes impossible maintaining even soft state for keyed
428    GRE tunnels with enabled checksum. Tell them "thank you".
430    Well, I wonder, rfc1812 was written by Cisco employee,
431    what the hell these idiots break standards established
435 	struct iphdr *iph = (struct iphdr *)skb->data;
436 	__be16	      *p = (__be16*)(skb->data+(iph->ihl<<2));
437 	int grehlen = (iph->ihl<<2) + 4;
438 	const int type = icmp_hdr(skb)->type;
439 	const int code = icmp_hdr(skb)->code;
/* Optional GRE fields enlarge the header; version/routing unsupported. */
444 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
445 		if (flags&(GRE_VERSION|GRE_ROUTING))
454 	/* If only 8 bytes returned, keyed message will be dropped here */
455 	if (skb_headlen(skb) < grehlen)
460 	case ICMP_PARAMETERPROB:
463 	case ICMP_DEST_UNREACH:
466 		case ICMP_PORT_UNREACH:
467 			/* Impossible event. */
469 		case ICMP_FRAG_NEEDED:
470 			/* Soft state for pmtu is maintained by IP core. */
473 			/* All others are translated to HOST_UNREACH.
474 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
475 			   I believe they are just ether pollution. --ANK
480 	case ICMP_TIME_EXCEEDED:
481 		if (code != ICMP_EXC_TTL)
/* Note reversed daddr/saddr: we look up by the *inner* direction. */
487 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
489 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
491 	if (t == NULL || t->parms.iph.daddr == 0 ||
492 	    ipv4_is_multicast(t->parms.iph.daddr))
495 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
/* Rate-limit error propagation to one burst per IPTUNNEL_ERR_TIMEO. */
498 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
502 		t->err_time = jiffies;
/*
 * ipgre_ecn_decapsulate - propagate a Congestion Experienced mark from
 * the outer IP header onto the inner IPv4/IPv6 header after decap.
 */
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
510 	if (INET_ECN_is_ce(iph->tos)) {
511 		if (skb->protocol == htons(ETH_P_IP)) {
512 			IP_ECN_set_ce(ip_hdr(skb));
513 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
514 			IP6_ECN_set_ce(ipv6_hdr(skb));
/*
 * ipgre_ecn_encapsulate - compute the outer TOS for encapsulation,
 * combining the configured tos with the inner packet's ECN bits
 * (DSCP field for IPv6) per INET_ECN_encapsulate().
 */
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
523 	if (skb->protocol == htons(ETH_P_IP))
524 		inner = old_iph->tos;
525 	else if (skb->protocol == htons(ETH_P_IPV6))
526 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527 	return INET_ECN_encapsulate(tos, inner);
/*
 * ipgre_rcv - GRE receive handler (net_protocol .handler).
 * Parses the GRE header (flags, optional csum/key/seq), finds the
 * owning tunnel, validates checksum/sequence, handles WCCP and
 * gretap (Ethernet) payloads, then hands the inner packet back to
 * the stack via skb_tunnel_rx().
 *
 * NOTE(review): many lines (offset bookkeeping, drop labels, final
 * return) are missing from this extract.
 */
530 static int ipgre_rcv(struct sk_buff *skb)
538 	struct ip_tunnel *tunnel;
/* 16 bytes = minimal GRE header (4) + worst-case optional fields pulled. */
542 	if (!pskb_may_pull(skb, 16))
549 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550 		/* - Version must be 0.
551 		   - We do not support routing headers.
553 		if (flags&(GRE_VERSION|GRE_ROUTING))
556 		if (flags&GRE_CSUM) {
557 			switch (skb->ip_summed) {
558 			case CHECKSUM_COMPLETE:
559 				csum = csum_fold(skb->csum);
565 				csum = __skb_checksum_complete(skb);
566 				skb->ip_summed = CHECKSUM_COMPLETE;
571 			key = *(__be32*)(h + offset);
575 			seqno = ntohl(*(__be32*)(h + offset));
580 	gre_proto = *(__be16 *)(h + 2);
583 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
584 					  iph->saddr, iph->daddr, key,
586 		struct net_device_stats *stats = &tunnel->dev->stats;
590 		skb->protocol = gre_proto;
591 		/* WCCP version 1 and 2 protocol decoding.
592 		 * - Change protocol to IP
593 		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
595 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
596 			skb->protocol = htons(ETH_P_IP);
/* 0x4x = inner IPv4 version nibble; otherwise WCCPv2 redirect header. */
597 			if ((*(h + offset) & 0xF0) != 0x40)
601 		skb->mac_header = skb->network_header;
602 		__pskb_pull(skb, offset);
603 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
604 		skb->pkt_type = PACKET_HOST;
605 #ifdef CONFIG_NET_IPGRE_BROADCAST
606 		if (ipv4_is_multicast(iph->daddr)) {
607 			/* Looped back packet, drop it! */
608 			if (skb_rtable(skb)->fl.iif == 0)
611 			skb->pkt_type = PACKET_BROADCAST;
/* Checksum present-but-bad, or required-but-absent: count as CRC error. */
615 		if (((flags&GRE_CSUM) && csum) ||
616 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
617 			stats->rx_crc_errors++;
621 		if (tunnel->parms.i_flags&GRE_SEQ) {
622 			if (!(flags&GRE_SEQ) ||
623 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
624 				stats->rx_fifo_errors++;
628 			tunnel->i_seqno = seqno + 1;
631 		/* Warning: All skb pointers will be invalidated! */
632 		if (tunnel->dev->type == ARPHRD_ETHER) {
633 			if (!pskb_may_pull(skb, ETH_HLEN)) {
634 				stats->rx_length_errors++;
640 			skb->protocol = eth_type_trans(skb, tunnel->dev);
641 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644 		skb_tunnel_rx(skb, tunnel->dev);
646 		skb_reset_network_header(skb);
647 		ipgre_ecn_decapsulate(iph, skb);
/* No tunnel claimed the packet: tell the sender the port is unreachable. */
653 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
 * ipgre_tunnel_xmit - ndo_start_xmit: encapsulate skb in GRE-over-IPv4
 * and transmit.  Resolves the outer destination (NBMA mode derives it
 * from the inner route/neighbour), routes the outer packet, enforces
 * PMTU, ensures headroom, builds the outer IP + GRE headers (optional
 * seq/key/csum fields) and sends.
 *
 * NOTE(review): numerous lines are missing from this extract (gre_hlen
 * setup for the non-header_ops path, tx error labels, ip_local_out /
 * IPTUNNEL_XMIT tail, etc.); comments describe only visible code.
 */
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
664 	struct ip_tunnel *tunnel = netdev_priv(dev);
665 	struct net_device_stats *stats = &dev->stats;
666 	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
667 	struct iphdr  *old_iph = ip_hdr(skb);
671 	struct rtable *rt;     			/* Route to the other host */
672 	struct net_device *tdev;			/* Device to other host */
673 	struct iphdr  *iph;			/* Our new IP header */
674 	unsigned int max_headroom;		/* The extra header space needed */
679 	if (dev->type == ARPHRD_ETHER)
680 		IPCB(skb)->flags = 0;
/* With header_ops the outer header was prebuilt by ipgre_header(). */
682 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
684 		tiph = (struct iphdr *)skb->data;
686 		gre_hlen = tunnel->hlen;
687 		tiph = &tunnel->parms.iph;
/* NBMA tunnel: no configured daddr, derive it from the inner packet. */
690 	if ((dst = tiph->daddr) == 0) {
693 		if (skb_dst(skb) == NULL) {
694 			stats->tx_fifo_errors++;
698 		if (skb->protocol == htons(ETH_P_IP)) {
699 			rt = skb_rtable(skb);
700 			if ((dst = rt->rt_gateway) == 0)
704 		else if (skb->protocol == htons(ETH_P_IPV6)) {
705 			struct in6_addr *addr6;
707 			struct neighbour *neigh = skb_dst(skb)->neighbour;
712 			addr6 = (struct in6_addr *)&neigh->primary_key;
713 			addr_type = ipv6_addr_type(addr6);
715 			if (addr_type == IPV6_ADDR_ANY) {
716 				addr6 = &ipv6_hdr(skb)->daddr;
717 				addr_type = ipv6_addr_type(addr6);
/* Only IPv4-compatible IPv6 addresses carry an embedded v4 target. */
720 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
723 			dst = addr6->s6_addr32[3];
733 	if (skb->protocol == htons(ETH_P_IP))
/* Route the outer packet. */
738 		struct flowi fl = { .oif = tunnel->parms.link,
741 						.saddr = tiph->saddr,
742 						.tos = RT_TOS(tos) } },
743 				    .proto = IPPROTO_GRE };
744 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
745 			stats->tx_carrier_errors++;
749 	tdev = rt->u.dst.dev;
/* Path-MTU accounting for the chosen route vs. the inner dst. */
759 		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
761 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
764 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
766 	if (skb->protocol == htons(ETH_P_IP)) {
767 		df |= (old_iph->frag_off&htons(IP_DF));
769 		if ((old_iph->frag_off&htons(IP_DF)) &&
770 		    mtu < ntohs(old_iph->tot_len)) {
771 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
777 	else if (skb->protocol == htons(ETH_P_IPV6)) {
778 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
781 			if ((tunnel->parms.iph.daddr &&
782 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
783 			    rt6->rt6i_dst.plen == 128) {
784 				rt6->rt6i_flags |= RTF_MODIFIED;
785 				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
789 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
790 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
/* Suppress transmission while a recent ICMP error is still fresh. */
797 	if (tunnel->err_count > 0) {
798 		if (time_before(jiffies,
799 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
802 			dst_link_failure(skb);
804 			tunnel->err_count = 0;
/* Guarantee room for link-layer + GRE + any extra dst header space. */
807 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
809 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
810 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
811 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
812 		if (max_headroom > dev->needed_headroom)
813 			dev->needed_headroom = max_headroom;
821 			skb_set_owner_w(new_skb, skb->sk);
824 		old_iph = ip_hdr(skb);
827 	skb_reset_transport_header(skb);
828 	skb_push(skb, gre_hlen);
829 	skb_reset_network_header(skb);
830 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
831 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
834 	skb_dst_set(skb, &rt->u.dst);
837 	 *	Push down and install the IPIP header.
842 	iph->ihl		=	sizeof(struct iphdr) >> 2;
844 	iph->protocol		=	IPPROTO_GRE;
845 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
846 	iph->daddr		=	rt->rt_dst;
847 	iph->saddr		=	rt->rt_src;
/* Outer TTL: configured value, else inherit from inner packet/route. */
849 	if ((iph->ttl = tiph->ttl) == 0) {
850 		if (skb->protocol == htons(ETH_P_IP))
851 			iph->ttl = old_iph->ttl;
853 		else if (skb->protocol == htons(ETH_P_IPV6))
854 			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
857 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
/* GRE base header: flags then protocol type. */
860 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
861 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
862 				   htons(ETH_P_TEB) : skb->protocol;
/* Optional fields appear in order seq, key, csum — written back-to-front. */
864 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
865 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
867 		if (tunnel->parms.o_flags&GRE_SEQ) {
869 			*ptr = htonl(tunnel->o_seqno);
872 		if (tunnel->parms.o_flags&GRE_KEY) {
873 			*ptr = tunnel->parms.o_key;
876 		if (tunnel->parms.o_flags&GRE_CSUM) {
878 			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
888 	dst_link_failure(skb);
/*
 * ipgre_tunnel_bind_dev - bind the tunnel to an underlying device and
 * derive mtu / needed_headroom / hlen from it.  Returns the mtu to use
 * (assignment at caller: dev->mtu = ipgre_tunnel_bind_dev(dev)).
 *
 * addend starts at outer-IP + 4-byte GRE base and grows by 4 per
 * enabled optional field (csum/key/seq increments are on lines missing
 * from this extract).
 */
896 static int ipgre_tunnel_bind_dev(struct net_device *dev)
898 	struct net_device *tdev = NULL;
899 	struct ip_tunnel *tunnel;
901 	int hlen = LL_MAX_HEADER;
902 	int mtu = ETH_DATA_LEN;
903 	int addend = sizeof(struct iphdr) + 4;
905 	tunnel = netdev_priv(dev);
906 	iph = &tunnel->parms.iph;
908 	/* Guess output device to choose reasonable mtu and needed_headroom */
911 		struct flowi fl = { .oif = tunnel->parms.link,
913 					    { .daddr = iph->daddr,
915 					      .tos = RT_TOS(iph->tos) } },
916 				    .proto = IPPROTO_GRE };
918 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
919 			tdev = rt->u.dst.dev;
923 		if (dev->type != ARPHRD_ETHER)
924 			dev->flags |= IFF_POINTOPOINT;
/* No route yet: fall back to the explicitly configured link device. */
927 	if (!tdev && tunnel->parms.link)
928 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
931 		hlen = tdev->hard_header_len + tdev->needed_headroom;
934 	dev->iflink = tunnel->parms.link;
936 	/* Precalculate GRE options length */
937 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
938 		if (tunnel->parms.o_flags&GRE_CSUM)
940 		if (tunnel->parms.o_flags&GRE_KEY)
942 		if (tunnel->parms.o_flags&GRE_SEQ)
945 	dev->needed_headroom = addend + hlen;
946 	mtu -= dev->hard_header_len + addend;
951 	tunnel->hlen = addend;
/*
 * ipgre_tunnel_ioctl - SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL handler.  Copies ip_tunnel_parm to/from userspace,
 * validates parameters, and creates/changes/deletes tunnels.
 * Add/change/delete require CAP_NET_ADMIN.
 *
 * NOTE(review): the switch skeleton, several error assignments and the
 * final return are missing from this extract.
 */
957 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
960 	struct ip_tunnel_parm p;
962 	struct net *net = dev_net(dev);
963 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* GET on the fallback device means "look up by the params in ifr". */
968 		if (dev == ign->fb_tunnel_dev) {
969 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
973 			t = ipgre_tunnel_locate(net, &p, 0);
976 			t = netdev_priv(dev);
977 		memcpy(&p, &t->parms, sizeof(p));
978 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
985 		if (!capable(CAP_NET_ADMIN))
989 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
/* Reject anything that isn't plain IPv4+GRE with supported flags. */
993 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
994 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
995 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
998 			p.iph.frag_off |= htons(IP_DF);
1000 		if (!(p.i_flags&GRE_KEY))
1002 		if (!(p.o_flags&GRE_KEY))
1005 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1007 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1009 				if (t->dev != dev) {
1014 				unsigned nflags = 0;
1016 				t = netdev_priv(dev);
1018 				if (ipv4_is_multicast(p.iph.daddr))
1019 					nflags = IFF_BROADCAST;
1020 				else if (p.iph.daddr)
1021 					nflags = IFF_POINTOPOINT;
/* Cannot flip p2p/broadcast mode on a live device via CHG. */
1023 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
/* Re-hash: addresses/keys feed the hash chain choice. */
1027 				ipgre_tunnel_unlink(ign, t);
1028 				t->parms.iph.saddr = p.iph.saddr;
1029 				t->parms.iph.daddr = p.iph.daddr;
1030 				t->parms.i_key = p.i_key;
1031 				t->parms.o_key = p.o_key;
1032 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
1033 				memcpy(dev->broadcast, &p.iph.daddr, 4);
1034 				ipgre_tunnel_link(ign, t);
1035 				netdev_state_change(dev);
1041 			if (cmd == SIOCCHGTUNNEL) {
1042 				t->parms.iph.ttl = p.iph.ttl;
1043 				t->parms.iph.tos = p.iph.tos;
1044 				t->parms.iph.frag_off = p.iph.frag_off;
1045 				if (t->parms.link != p.link) {
1046 					t->parms.link = p.link;
1047 					dev->mtu = ipgre_tunnel_bind_dev(dev);
1048 					netdev_state_change(dev);
1051 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1054 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1059 		if (!capable(CAP_NET_ADMIN))
1062 		if (dev == ign->fb_tunnel_dev) {
1064 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1067 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
/* The fallback device itself can never be deleted. */
1070 			if (t == netdev_priv(ign->fb_tunnel_dev))
1074 		unregister_netdevice(dev);
/*
 * ipgre_tunnel_change_mtu - ndo_change_mtu: bound new_mtu by the max
 * IP payload (0xFFF8) minus link header and GRE overhead.  The lower
 * bound check and the EINVAL return are on lines missing here.
 */
1086 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1088 	struct ip_tunnel *tunnel = netdev_priv(dev);
1090 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1096 /* Nice toy. Unfortunately, useless in real life :-)
1097 It allows to construct virtual multiprotocol broadcast "LAN"
1098 over the Internet, provided multicast routing is tuned.
1101 I have no idea was this bicycle invented before me,
1102 so that I had to set ARPHRD_IPGRE to a random value.
1103 I have an impression, that Cisco could make something similar,
1104 but this feature is apparently missing in IOS<=11.2(8).
1106 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1107 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1109 ping -t 255 224.66.66.66
1111 If nobody answers, mbone does not work.
1113 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1114 ip addr add 10.66.66.<somewhat>/24 dev Universe
1115 ifconfig Universe up
1116 ifconfig Universe add fe80::<Your_real_addr>/10
1117 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1120 ftp fec0:6666:6666::193.233.7.65
/*
 * ipgre_header - header_ops.create: prebuild the outer IP + GRE header
 * in front of the payload (used by the broadcast "virtual LAN" mode).
 * The 4-byte "hardware addresses" are the outer IPv4 saddr/daddr.
 */
1125 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1126 			unsigned short type,
1127 			const void *daddr, const void *saddr, unsigned len)
1129 	struct ip_tunnel *t = netdev_priv(dev);
1130 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1131 	__be16 *p = (__be16*)(iph+1);
1133 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1134 	p[0]		= t->parms.o_flags;
1138 	 *	Set the source hardware address.
1142 		memcpy(&iph->saddr, saddr, 4);
1144 		memcpy(&iph->daddr, daddr, 4);
/* header_ops.parse: the "hardware address" is the outer IPv4 saddr. */
1151 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154 	memcpy(haddr, &iph->saddr, 4);
/* Link-layer header ops for broadcast-mode GRE devices. */
1158 static const struct header_ops ipgre_header_ops = {
1159 	.create	= ipgre_header,
1160 	.parse	= ipgre_header_parse,
1163 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ipgre_open - ndo_open: for a multicast-destination tunnel, route the
 * multicast group, remember the underlying ifindex in t->mlink and
 * join the group on that interface (left again in ipgre_close()).
 */
1164 static int ipgre_open(struct net_device *dev)
1166 	struct ip_tunnel *t = netdev_priv(dev);
1168 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1169 		struct flowi fl = { .oif = t->parms.link,
1171 					    { .daddr = t->parms.iph.daddr,
1172 					      .saddr = t->parms.iph.saddr,
1173 					      .tos = RT_TOS(t->parms.iph.tos) } },
1174 				    .proto = IPPROTO_GRE };
1176 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
1177 			return -EADDRNOTAVAIL;
/* Note: 'dev' is rebound to the route's output device from here on. */
1178 		dev = rt->u.dst.dev;
1180 		if (__in_dev_get_rtnl(dev) == NULL)
1181 			return -EADDRNOTAVAIL;
1182 		t->mlink = dev->ifindex;
1183 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* ndo_stop: leave the multicast group joined in ipgre_open(). */
1188 static int ipgre_close(struct net_device *dev)
1190 	struct ip_tunnel *t = netdev_priv(dev);
1192 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1193 		struct in_device *in_dev;
1194 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* netdev ops for plain (layer-3) GRE tunnel devices. */
1205 static const struct net_device_ops ipgre_netdev_ops = {
1206 	.ndo_init		= ipgre_tunnel_init,
1207 	.ndo_uninit		= ipgre_tunnel_uninit,
1208 #ifdef CONFIG_NET_IPGRE_BROADCAST
1209 	.ndo_open		= ipgre_open,
1210 	.ndo_stop		= ipgre_close,
1212 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1213 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
1214 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
/*
 * ipgre_tunnel_setup - alloc_netdev() setup callback for layer-3 GRE
 * devices: default mtu/headroom assume a plain 4-byte GRE header.
 */
1217 static void ipgre_tunnel_setup(struct net_device *dev)
1219 	dev->netdev_ops		= &ipgre_netdev_ops;
1220 	dev->destructor 	= free_netdev;
1222 	dev->type		= ARPHRD_IPGRE;
1223 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1224 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1225 	dev->flags		= IFF_NOARP;
1228 	dev->features		|= NETIF_F_NETNS_LOCAL;
1229 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
/*
 * ipgre_tunnel_init - ndo_init: publish the outer addresses as the
 * device's dev_addr/broadcast and select header_ops (broadcast mode
 * for multicast destinations, NBMA otherwise).
 */
1232 static int ipgre_tunnel_init(struct net_device *dev)
1234 	struct ip_tunnel *tunnel;
1237 	tunnel = netdev_priv(dev);
1238 	iph = &tunnel->parms.iph;
1241 	strcpy(tunnel->parms.name, dev->name);
1243 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1244 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1247 #ifdef CONFIG_NET_IPGRE_BROADCAST
1248 	if (ipv4_is_multicast(iph->daddr)) {
1251 		dev->flags = IFF_BROADCAST;
1252 		dev->header_ops = &ipgre_header_ops;
1256 		dev->header_ops = &ipgre_header_ops;
/*
 * ipgre_fb_tunnel_init - initialize the per-netns fallback "gre0"
 * device and register it in the wildcard (keyless) hash slot.
 */
1261 static void ipgre_fb_tunnel_init(struct net_device *dev)
1263 	struct ip_tunnel *tunnel = netdev_priv(dev);
1264 	struct iphdr *iph = &tunnel->parms.iph;
1265 	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1268 	strcpy(tunnel->parms.name, dev->name);
1271 	iph->protocol		= IPPROTO_GRE;
1273 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1276 	ign->tunnels_wc[0]	= tunnel;
/* inet protocol hooks for IPPROTO_GRE (registered in ipgre_init()). */
1280 static const struct net_protocol ipgre_protocol = {
1281 	.handler	=	ipgre_rcv,
1282 	.err_handler	=	ipgre_err,
/*
 * ipgre_destroy_tunnels - netns teardown: queue every tunnel device in
 * all 4 priority tables for batched unregistration.
 */
1286 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1290 	for (prio = 0; prio < 4; prio++) {
1292 		for (h = 0; h < HASH_SIZE; h++) {
1293 			struct ip_tunnel *t = ign->tunnels[prio][h];
1296 				unregister_netdevice_queue(t->dev, head);
/*
 * ipgre_init_net - per-netns init: create and register the fallback
 * "gre0" device.  Error path frees the netdev on registration failure.
 */
1303 static int __net_init ipgre_init_net(struct net *net)
1305 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1308 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1309 					   ipgre_tunnel_setup);
1310 	if (!ign->fb_tunnel_dev) {
1314 	dev_net_set(ign->fb_tunnel_dev, net);
1316 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1317 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1319 	if ((err = register_netdev(ign->fb_tunnel_dev)))
1325 	free_netdev(ign->fb_tunnel_dev);
/* Per-netns exit: unregister all tunnels in one batch. */
1330 static void __net_exit ipgre_exit_net(struct net *net)
1332 	struct ipgre_net *ign;
1335 	ign = net_generic(net, ipgre_net_id);
1337 	ipgre_destroy_tunnels(ign, &list);
1338 	unregister_netdevice_many(&list);
/* Per-network-namespace lifecycle hooks for the GRE module. */
1342 static struct pernet_operations ipgre_net_ops = {
1343 	.init = ipgre_init_net,
1344 	.exit = ipgre_exit_net,
1345 	.id   = &ipgre_net_id,
1346 	.size = sizeof(struct ipgre_net),
/*
 * ipgre_tunnel_validate - rtnl_link_ops.validate for "gre": reject
 * GRE version/routing flags, which this driver does not support.
 */
1349 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 	if (data[IFLA_GRE_IFLAGS])
1358 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1359 	if (data[IFLA_GRE_OFLAGS])
1360 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1361 	if (flags & (GRE_VERSION|GRE_ROUTING))
/*
 * ipgre_tap_validate - rtnl_link_ops.validate for "gretap": checks the
 * Ethernet MAC (if given) and the remote address, then defers to the
 * common GRE flag validation.
 */
1367 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1371 	if (tb[IFLA_ADDRESS]) {
1372 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1374 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1375 			return -EADDRNOTAVAIL;
1381 	if (data[IFLA_GRE_REMOTE]) {
1382 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1388 	return ipgre_tunnel_validate(tb, data);
/*
 * ipgre_netlink_parms - translate IFLA_GRE_* netlink attributes into an
 * ip_tunnel_parm.  Missing attributes keep their zeroed defaults;
 * PMTU discovery (DF bit) defaults to on unless explicitly disabled.
 */
1391 static void ipgre_netlink_parms(struct nlattr *data[],
1392 				struct ip_tunnel_parm *parms)
1394 	memset(parms, 0, sizeof(*parms));
1396 	parms->iph.protocol = IPPROTO_GRE;
1401 	if (data[IFLA_GRE_LINK])
1402 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1404 	if (data[IFLA_GRE_IFLAGS])
1405 		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1407 	if (data[IFLA_GRE_OFLAGS])
1408 		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1410 	if (data[IFLA_GRE_IKEY])
1411 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1413 	if (data[IFLA_GRE_OKEY])
1414 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1416 	if (data[IFLA_GRE_LOCAL])
1417 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1419 	if (data[IFLA_GRE_REMOTE])
1420 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1422 	if (data[IFLA_GRE_TTL])
1423 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1425 	if (data[IFLA_GRE_TOS])
1426 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1428 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1429 		parms->iph.frag_off = htons(IP_DF);
/* ndo_init for gretap devices: record name and bind to the lower dev. */
1432 static int ipgre_tap_init(struct net_device *dev)
1434 	struct ip_tunnel *tunnel;
1436 	tunnel = netdev_priv(dev);
1439 	strcpy(tunnel->parms.name, dev->name);
1441 	ipgre_tunnel_bind_dev(dev);
/* netdev ops for gretap (Ethernet-over-GRE) devices. */
1446 static const struct net_device_ops ipgre_tap_netdev_ops = {
1447 	.ndo_init		= ipgre_tap_init,
1448 	.ndo_uninit		= ipgre_tunnel_uninit,
1449 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1450 	.ndo_set_mac_address 	= eth_mac_addr,
1451 	.ndo_validate_addr	= eth_validate_addr,
1452 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
/* alloc_netdev() setup for gretap (ether_setup() call is on a missing line). */
1455 static void ipgre_tap_setup(struct net_device *dev)
1460 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
1461 	dev->destructor 	= free_netdev;
1464 	dev->features		|= NETIF_F_NETNS_LOCAL;
/*
 * ipgre_newlink - rtnl_link_ops.newlink: create a GRE/gretap device
 * from netlink attributes; refuses duplicates of an existing tunnel
 * and gives gretap devices a random MAC when none was supplied.
 */
1467 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1468 			 struct nlattr *data[])
1470 	struct ip_tunnel *nt;
1471 	struct net *net = dev_net(dev);
1472 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1476 	nt = netdev_priv(dev);
1477 	ipgre_netlink_parms(data, &nt->parms);
1479 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1482 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1483 		random_ether_addr(dev->dev_addr);
1485 	mtu = ipgre_tunnel_bind_dev(dev);
1489 	err = register_netdevice(dev);
1494 	ipgre_tunnel_link(ign, nt);
/*
 * ipgre_changelink - rtnl_link_ops.changelink: apply new netlink
 * parameters to an existing device; re-hashes the tunnel when the
 * address/key tuple changes.  Mirrors the SIOCCHGTUNNEL ioctl logic,
 * but refuses p2p/broadcast mode flips and the fallback device.
 *
 * NOTE(review): duplicate-target checks around line 1515 are missing
 * from this extract.
 */
1500 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1501 			    struct nlattr *data[])
1503 	struct ip_tunnel *t, *nt;
1504 	struct net *net = dev_net(dev);
1505 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1506 	struct ip_tunnel_parm p;
1509 	if (dev == ign->fb_tunnel_dev)
1512 	nt = netdev_priv(dev);
1513 	ipgre_netlink_parms(data, &p);
1515 	t = ipgre_tunnel_locate(net, &p, 0);
1523 		if (dev->type != ARPHRD_ETHER) {
1524 			unsigned nflags = 0;
1526 			if (ipv4_is_multicast(p.iph.daddr))
1527 				nflags = IFF_BROADCAST;
1528 			else if (p.iph.daddr)
1529 				nflags = IFF_POINTOPOINT;
1531 			if ((dev->flags ^ nflags) &
1532 			    (IFF_POINTOPOINT | IFF_BROADCAST))
/* Re-hash under the new addresses/key. */
1536 		ipgre_tunnel_unlink(ign, t);
1537 		t->parms.iph.saddr = p.iph.saddr;
1538 		t->parms.iph.daddr = p.iph.daddr;
1539 		t->parms.i_key = p.i_key;
1540 		if (dev->type != ARPHRD_ETHER) {
1541 			memcpy(dev->dev_addr, &p.iph.saddr, 4);
1542 			memcpy(dev->broadcast, &p.iph.daddr, 4);
1544 		ipgre_tunnel_link(ign, t);
1545 		netdev_state_change(dev);
/* Non-hash-affecting parameters can be updated in place. */
1548 	t->parms.o_key = p.o_key;
1549 	t->parms.iph.ttl = p.iph.ttl;
1550 	t->parms.iph.tos = p.iph.tos;
1551 	t->parms.iph.frag_off = p.iph.frag_off;
1553 	if (t->parms.link != p.link) {
1554 		t->parms.link = p.link;
1555 		mtu = ipgre_tunnel_bind_dev(dev);
1558 		netdev_state_change(dev);
/*
 * ipgre_get_size - rtnl_link_ops.get_size: netlink message size needed
 * by ipgre_fill_info().  The per-attribute nla_total_size() terms are
 * on lines missing from this extract.
 */
1564 static size_t ipgre_get_size(const struct net_device *dev)
1569 		/* IFLA_GRE_IFLAGS */
1571 		/* IFLA_GRE_OFLAGS */
1577 		/* IFLA_GRE_LOCAL */
1579 		/* IFLA_GRE_REMOTE */
1585 		/* IFLA_GRE_PMTUDISC */
/*
 * ipgre_fill_info - rtnl_link_ops.fill_info: dump the tunnel's current
 * parameters as IFLA_GRE_* attributes (NLA_PUT_* jumps to
 * nla_put_failure on overflow).
 */
1590 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1592 	struct ip_tunnel *t = netdev_priv(dev);
1593 	struct ip_tunnel_parm *p = &t->parms;
1595 	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1596 	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1597 	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1598 	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1599 	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1600 	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1601 	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1602 	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1603 	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1604 	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/* Netlink attribute validation policy for IFLA_GRE_*. */
1612 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1613 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1614 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1615 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1616 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1617 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1618 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1619 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1620 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1621 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1622 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
/* rtnl_link_ops for plain "gre" (layer-3) devices. */
1625 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1627 	.maxtype	= IFLA_GRE_MAX,
1628 	.policy		= ipgre_policy,
1629 	.priv_size	= sizeof(struct ip_tunnel),
1630 	.setup		= ipgre_tunnel_setup,
1631 	.validate	= ipgre_tunnel_validate,
1632 	.newlink	= ipgre_newlink,
1633 	.changelink	= ipgre_changelink,
1634 	.get_size	= ipgre_get_size,
1635 	.fill_info	= ipgre_fill_info,
/* rtnl_link_ops for "gretap" (Ethernet-over-GRE) devices. */
1638 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1640 	.maxtype	= IFLA_GRE_MAX,
1641 	.policy		= ipgre_policy,
1642 	.priv_size	= sizeof(struct ip_tunnel),
1643 	.setup		= ipgre_tap_setup,
1644 	.validate	= ipgre_tap_validate,
1645 	.newlink	= ipgre_newlink,
1646 	.changelink	= ipgre_changelink,
1647 	.get_size	= ipgre_get_size,
1648 	.fill_info	= ipgre_fill_info,
1652 * And now the modules code and kernel interface.
/*
 * ipgre_init - module init: pernet ops, then IPPROTO_GRE handler, then
 * both rtnl_link_ops; unwinds in reverse order on any failure.
 */
1655 static int __init ipgre_init(void)
1659 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1661 	err = register_pernet_device(&ipgre_net_ops);
1665 	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1667 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1668 		goto add_proto_failed;
1671 	err = rtnl_link_register(&ipgre_link_ops);
1673 		goto rtnl_link_failed;
1675 	err = rtnl_link_register(&ipgre_tap_ops);
1677 		goto tap_ops_failed;
/* Error unwind, reverse order of registration. */
1683 	rtnl_link_unregister(&ipgre_link_ops);
1685 	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1687 	unregister_pernet_device(&ipgre_net_ops);
/* Module exit: tear down in reverse order of ipgre_init(). */
1691 static void __exit ipgre_fini(void)
1693 	rtnl_link_unregister(&ipgre_tap_ops);
1694 	rtnl_link_unregister(&ipgre_link_ops);
1695 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1696 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
1697 	unregister_pernet_device(&ipgre_net_ops);
1700 module_init(ipgre_init);
1701 module_exit(ipgre_fini);
1702 MODULE_LICENSE("GPL");
1703 MODULE_ALIAS_RTNL_LINK("gre");
1704 MODULE_ALIAS_RTNL_LINK("gretap");