1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47
48 #ifdef CONFIG_IPV6
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #endif
53
54 /*
55    Problems & solutions
56    --------------------
57
58    1. The most important issue is detecting local dead loops.
59    They would cause a complete host lockup in transmit, which
60    would be "resolved" by a stack overflow or, if queueing is enabled,
61    by infinite looping in net_bh.
62
63    We cannot track such dead loops during route installation;
64    it is an infeasible task. The most general solution would be
65    to keep an skb->encapsulation counter (a sort of local ttl)
66    and silently drop the packet when it expires. It is the best
67    solution, but it supposes maintaining a new variable in ALL
68    skbs, even if no tunneling is used.
69
70    Current solution: HARD_TX_LOCK lock breaks dead loops.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill the network. The IP hop limit plays the role of "t->recursion" in this case,
76    if we copy it from the packet being encapsulated to the upper header.
77    It is a very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from the tunnel,
82      so that this problem would be solved and traceroute output
83      would be even more informative. This idea turned out to be wrong:
84      only Linux complies with rfc1812 now (yes, guys, Linux is the only
85      true router now :-)); all routers (at least, in my neighbourhood)
86      return only 8 bytes of payload. That is the end of it.
87
88    Hence, if we want OSPF to work or traceroute to say something reasonable,
89    we should search for another solution.
90
91    One of them is to parse the packet, trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially
93    taking fragmentation into account. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force the DF flag on tunnels with a preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but the exponential growth of network traffic is changed to linear
99    (branches that exceed the pmtu are pruned) and the tunnel mtu
100    quickly degrades to a value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop
102    which does not force DF, even when the packets it encapsulates have DF set.
103    But it is not our problem! Nobody could accuse us; we did
104    all that we could. Even if it is your gated that injected the
105    fatal route into the network, even if it was you who configured the
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not obvious how to make them modular.
113    sit is an integral part of IPv6, while ipip and gre are naturally modular.
114    We could extract the common parts (hash table, ioctl etc.)
115    into a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
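
/* For illustration, the "force DF when a hop limit is preconfigured" rule
 * described above is applied where tunnel parameters are set: the ioctl
 * path (ipgre_tunnel_ioctl) below contains
 *
 *	if (p.iph.ttl)
 *		p.iph.frag_off |= htons(IP_DF);
 *
 * and the netlink path (ipgre_netlink_parms) sets IP_DF unless path MTU
 * discovery is explicitly disabled, so such tunnels rely on PMTU discovery
 * instead of fragmenting in a loop.
 */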
119
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
122 static void ipgre_tunnel_setup(struct net_device *dev);
123 static int ipgre_tunnel_bind_dev(struct net_device *dev);
124
125 /* Fallback tunnel: no source, no destination, no key, no options */
126
127 #define HASH_SIZE  16
128
129 static int ipgre_net_id __read_mostly;
130 struct ipgre_net {
131         struct ip_tunnel *tunnels[4][HASH_SIZE];
132
133         struct net_device *fb_tunnel_dev;
134 };
135
136 /* Tunnel hash table */
137
138 /*
139    4 hash tables:
140
141    3: (remote,local)
142    2: (remote,*)
143    1: (*,local)
144    0: (*,*)
145
146    We require an exact key match, i.e. if a key is present in the packet
147    it will match only a tunnel with the same key; if it is not present,
148    it will match only a keyless tunnel.
149
150    All keyless packets, if not matched against configured keyless tunnels,
151    will match the fallback tunnel.
152  */
153
154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
155
156 #define tunnels_r_l     tunnels[3]
157 #define tunnels_r       tunnels[2]
158 #define tunnels_l       tunnels[1]
159 #define tunnels_wc      tunnels[0]
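
/* For illustration, how __ipgre_bucket() below places a tunnel (addresses
 * and key are hypothetical):
 *
 *	remote 10.0.0.2, local 10.0.0.1, i_key set
 *		-> prio 3 (both endpoints known): chain tunnels_r_l,
 *		   index HASH(i_key) ^ HASH(remote)
 *	no remote, no local, no key
 *		-> prio 0: chain tunnels_wc, index HASH(0) == 0,
 *		   which is also where the fallback gre0 device lives
 *
 * ipgre_tunnel_lookup() walks the chains from most to least specific.
 */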
160 /*
161  * Locking : hash tables are protected by RCU and a spinlock
162  */
163 static DEFINE_SPINLOCK(ipgre_lock);
164
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168 /* Given src, dst and key, find the appropriate tunnel for input. */
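/*
 * The lookup runs four passes, from the most to the least specific table:
 * (remote,local), (remote,*), (*,local or local multicast), (*,*).  In each
 * pass the key must match exactly and the device must be IFF_UP.  A tunnel
 * whose link (ifindex) and device type both match is returned immediately
 * (score 0); otherwise the closest near-miss is remembered (score bit 0:
 * link mismatch, bit 1: type mismatch) and returned at the end.  If nothing
 * matches, the fallback device is used, provided it is up.
 */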
169
170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
171                                               __be32 remote, __be32 local,
172                                               __be32 key, __be16 gre_proto)
173 {
174         struct net *net = dev_net(dev);
175         int link = dev->ifindex;
176         unsigned h0 = HASH(remote);
177         unsigned h1 = HASH(key);
178         struct ip_tunnel *t, *cand = NULL;
179         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181                        ARPHRD_ETHER : ARPHRD_IPGRE;
182         int score, cand_score = 4;
183
184         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
185                 if (local != t->parms.iph.saddr ||
186                     remote != t->parms.iph.daddr ||
187                     key != t->parms.i_key ||
188                     !(t->dev->flags & IFF_UP))
189                         continue;
190
191                 if (t->dev->type != ARPHRD_IPGRE &&
192                     t->dev->type != dev_type)
193                         continue;
194
195                 score = 0;
196                 if (t->parms.link != link)
197                         score |= 1;
198                 if (t->dev->type != dev_type)
199                         score |= 2;
200                 if (score == 0)
201                         return t;
202
203                 if (score < cand_score) {
204                         cand = t;
205                         cand_score = score;
206                 }
207         }
208
209         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
210                 if (remote != t->parms.iph.daddr ||
211                     key != t->parms.i_key ||
212                     !(t->dev->flags & IFF_UP))
213                         continue;
214
215                 if (t->dev->type != ARPHRD_IPGRE &&
216                     t->dev->type != dev_type)
217                         continue;
218
219                 score = 0;
220                 if (t->parms.link != link)
221                         score |= 1;
222                 if (t->dev->type != dev_type)
223                         score |= 2;
224                 if (score == 0)
225                         return t;
226
227                 if (score < cand_score) {
228                         cand = t;
229                         cand_score = score;
230                 }
231         }
232
233         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
234                 if ((local != t->parms.iph.saddr &&
235                      (local != t->parms.iph.daddr ||
236                       !ipv4_is_multicast(local))) ||
237                     key != t->parms.i_key ||
238                     !(t->dev->flags & IFF_UP))
239                         continue;
240
241                 if (t->dev->type != ARPHRD_IPGRE &&
242                     t->dev->type != dev_type)
243                         continue;
244
245                 score = 0;
246                 if (t->parms.link != link)
247                         score |= 1;
248                 if (t->dev->type != dev_type)
249                         score |= 2;
250                 if (score == 0)
251                         return t;
252
253                 if (score < cand_score) {
254                         cand = t;
255                         cand_score = score;
256                 }
257         }
258
259         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
260                 if (t->parms.i_key != key ||
261                     !(t->dev->flags & IFF_UP))
262                         continue;
263
264                 if (t->dev->type != ARPHRD_IPGRE &&
265                     t->dev->type != dev_type)
266                         continue;
267
268                 score = 0;
269                 if (t->parms.link != link)
270                         score |= 1;
271                 if (t->dev->type != dev_type)
272                         score |= 2;
273                 if (score == 0)
274                         return t;
275
276                 if (score < cand_score) {
277                         cand = t;
278                         cand_score = score;
279                 }
280         }
281
282         if (cand != NULL)
283                 return cand;
284
285         dev = ign->fb_tunnel_dev;
286         if (dev->flags & IFF_UP)
287                 return netdev_priv(dev);
288
289         return NULL;
290 }
291
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293                 struct ip_tunnel_parm *parms)
294 {
295         __be32 remote = parms->iph.daddr;
296         __be32 local = parms->iph.saddr;
297         __be32 key = parms->i_key;
298         unsigned h = HASH(key);
299         int prio = 0;
300
301         if (local)
302                 prio |= 1;
303         if (remote && !ipv4_is_multicast(remote)) {
304                 prio |= 2;
305                 h ^= HASH(remote);
306         }
307
308         return &ign->tunnels[prio][h];
309 }
310
311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
312                 struct ip_tunnel *t)
313 {
314         return __ipgre_bucket(ign, &t->parms);
315 }
316
317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318 {
319         struct ip_tunnel **tp = ipgre_bucket(ign, t);
320
321         spin_lock_bh(&ipgre_lock);
322         t->next = *tp;
323         rcu_assign_pointer(*tp, t);
324         spin_unlock_bh(&ipgre_lock);
325 }
326
327 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328 {
329         struct ip_tunnel **tp;
330
331         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
332                 if (t == *tp) {
333                         spin_lock_bh(&ipgre_lock);
334                         *tp = t->next;
335                         spin_unlock_bh(&ipgre_lock);
336                         break;
337                 }
338         }
339 }
340
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342                                            struct ip_tunnel_parm *parms,
343                                            int type)
344 {
345         __be32 remote = parms->iph.daddr;
346         __be32 local = parms->iph.saddr;
347         __be32 key = parms->i_key;
348         int link = parms->link;
349         struct ip_tunnel *t, **tp;
350         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351
352         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353                 if (local == t->parms.iph.saddr &&
354                     remote == t->parms.iph.daddr &&
355                     key == t->parms.i_key &&
356                     link == t->parms.link &&
357                     type == t->dev->type)
358                         break;
359
360         return t;
361 }
362
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364                 struct ip_tunnel_parm *parms, int create)
365 {
366         struct ip_tunnel *t, *nt;
367         struct net_device *dev;
368         char name[IFNAMSIZ];
369         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
370
371         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
372         if (t || !create)
373                 return t;
374
375         if (parms->name[0])
376                 strlcpy(name, parms->name, IFNAMSIZ);
377         else
378                 sprintf(name, "gre%%d");
379
380         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381         if (!dev)
382                 return NULL;
383
384         dev_net_set(dev, net);
385
386         if (strchr(name, '%')) {
387                 if (dev_alloc_name(dev, name) < 0)
388                         goto failed_free;
389         }
390
391         nt = netdev_priv(dev);
392         nt->parms = *parms;
393         dev->rtnl_link_ops = &ipgre_link_ops;
394
395         dev->mtu = ipgre_tunnel_bind_dev(dev);
396
397         if (register_netdevice(dev) < 0)
398                 goto failed_free;
399
400         dev_hold(dev);
401         ipgre_tunnel_link(ign, nt);
402         return nt;
403
404 failed_free:
405         free_netdev(dev);
406         return NULL;
407 }
408
409 static void ipgre_tunnel_uninit(struct net_device *dev)
410 {
411         struct net *net = dev_net(dev);
412         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
413
414         ipgre_tunnel_unlink(ign, netdev_priv(dev));
415         dev_put(dev);
416 }
417
418
419 static void ipgre_err(struct sk_buff *skb, u32 info)
420 {
421
422 /* All the routers (except for Linux) return only
423    8 bytes of packet payload. It means that precise relaying of
424    ICMP in the real Internet is absolutely infeasible.
425
426    Moreover, Cisco "wise men" put the GRE key into the third word
427    of the GRE header. It makes it impossible to maintain even soft state for keyed
428    GRE tunnels with the checksum enabled. Tell them "thank you".
429
430    Well, I wonder: rfc1812 was written by a Cisco employee, so
431    why the hell do these idiots break standards established
432    by themselves???
433  */
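
/* On-the-wire layout assumed by the parsing below (RFC 2784/2890):
 *
 *	bytes 0-1 : flags (GRE_CSUM, GRE_KEY, GRE_SEQ, GRE_ROUTING, GRE_VERSION)
 *	bytes 2-3 : protocol type
 *	+4 bytes  : checksum + reserved, present iff GRE_CSUM
 *	+4 bytes  : key, present iff GRE_KEY
 *	+4 bytes  : sequence number, present iff GRE_SEQ
 *
 * grehlen below is grown only far enough to reach the key, so when GRE_KEY
 * is set the key is the last 32-bit word before grehlen.
 */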
434
435         struct iphdr *iph = (struct iphdr *)skb->data;
436         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
437         int grehlen = (iph->ihl<<2) + 4;
438         const int type = icmp_hdr(skb)->type;
439         const int code = icmp_hdr(skb)->code;
440         struct ip_tunnel *t;
441         __be16 flags;
442
443         flags = p[0];
444         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
445                 if (flags&(GRE_VERSION|GRE_ROUTING))
446                         return;
447                 if (flags&GRE_KEY) {
448                         grehlen += 4;
449                         if (flags&GRE_CSUM)
450                                 grehlen += 4;
451                 }
452         }
453
454         /* If only 8 bytes were returned, a keyed message will be dropped here */
455         if (skb_headlen(skb) < grehlen)
456                 return;
457
458         switch (type) {
459         default:
460         case ICMP_PARAMETERPROB:
461                 return;
462
463         case ICMP_DEST_UNREACH:
464                 switch (code) {
465                 case ICMP_SR_FAILED:
466                 case ICMP_PORT_UNREACH:
467                         /* Impossible event. */
468                         return;
469                 case ICMP_FRAG_NEEDED:
470                         /* Soft state for pmtu is maintained by IP core. */
471                         return;
472                 default:
473                         /* All others are translated to HOST_UNREACH.
474                            rfc2003 contains "deep thoughts" about NET_UNREACH,
475                            I believe they are just ether pollution. --ANK
476                          */
477                         break;
478                 }
479                 break;
480         case ICMP_TIME_EXCEEDED:
481                 if (code != ICMP_EXC_TTL)
482                         return;
483                 break;
484         }
485
486         rcu_read_lock();
487         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
488                                 flags & GRE_KEY ?
489                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
490                                 p[1]);
491         if (t == NULL || t->parms.iph.daddr == 0 ||
492             ipv4_is_multicast(t->parms.iph.daddr))
493                 goto out;
494
495         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
496                 goto out;
497
498         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
499                 t->err_count++;
500         else
501                 t->err_count = 1;
502         t->err_time = jiffies;
503 out:
504         rcu_read_unlock();
505         return;
506 }
507
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
509 {
510         if (INET_ECN_is_ce(iph->tos)) {
511                 if (skb->protocol == htons(ETH_P_IP)) {
512                         IP_ECN_set_ce(ip_hdr(skb));
513                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
514                         IP6_ECN_set_ce(ipv6_hdr(skb));
515                 }
516         }
517 }
518
519 static inline u8
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
521 {
522         u8 inner = 0;
523         if (skb->protocol == htons(ETH_P_IP))
524                 inner = old_iph->tos;
525         else if (skb->protocol == htons(ETH_P_IPV6))
526                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527         return INET_ECN_encapsulate(tos, inner);
528 }
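
/*
 * ECN handling: on decapsulation (above) a CE mark on the outer header is
 * propagated into the inner IPv4/IPv6 header; on encapsulation the outer
 * TOS is built by INET_ECN_encapsulate() from the configured TOS and the
 * inner header's TOS / traffic-class field.
 */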
529
530 static int ipgre_rcv(struct sk_buff *skb)
531 {
532         struct iphdr *iph;
533         u8     *h;
534         __be16    flags;
535         __sum16   csum = 0;
536         __be32 key = 0;
537         u32    seqno = 0;
538         struct ip_tunnel *tunnel;
539         int    offset = 4;
540         __be16 gre_proto;
541         unsigned int len;
542
543         if (!pskb_may_pull(skb, 16))
544                 goto drop_nolock;
545
546         iph = ip_hdr(skb);
547         h = skb->data;
548         flags = *(__be16*)h;
549
550         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
551                 /* - Version must be 0.
552                    - We do not support routing headers.
553                  */
554                 if (flags&(GRE_VERSION|GRE_ROUTING))
555                         goto drop_nolock;
556
557                 if (flags&GRE_CSUM) {
558                         switch (skb->ip_summed) {
559                         case CHECKSUM_COMPLETE:
560                                 csum = csum_fold(skb->csum);
561                                 if (!csum)
562                                         break;
563                                 /* fall through */
564                         case CHECKSUM_NONE:
565                                 skb->csum = 0;
566                                 csum = __skb_checksum_complete(skb);
567                                 skb->ip_summed = CHECKSUM_COMPLETE;
568                         }
569                         offset += 4;
570                 }
571                 if (flags&GRE_KEY) {
572                         key = *(__be32*)(h + offset);
573                         offset += 4;
574                 }
575                 if (flags&GRE_SEQ) {
576                         seqno = ntohl(*(__be32*)(h + offset));
577                         offset += 4;
578                 }
579         }
580
581         gre_proto = *(__be16 *)(h + 2);
582
583         rcu_read_lock();
584         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
585                                           iph->saddr, iph->daddr, key,
586                                           gre_proto))) {
587                 struct net_device_stats *stats = &tunnel->dev->stats;
588
589                 secpath_reset(skb);
590
591                 skb->protocol = gre_proto;
592                 /* WCCP version 1 and 2 protocol decoding.
593                  * - Change protocol to IP
594                  * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
595                  */
596                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
597                         skb->protocol = htons(ETH_P_IP);
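                        /* WCCPv2 prepends a 4-byte redirect header to the
                         * payload; if the next byte does not carry an IPv4
                         * version nibble (0x4), assume that header is
                         * present and skip it. */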
598                         if ((*(h + offset) & 0xF0) != 0x40)
599                                 offset += 4;
600                 }
601
602                 skb->mac_header = skb->network_header;
603                 __pskb_pull(skb, offset);
604                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
605                 skb->pkt_type = PACKET_HOST;
606 #ifdef CONFIG_NET_IPGRE_BROADCAST
607                 if (ipv4_is_multicast(iph->daddr)) {
608                         /* Looped back packet, drop it! */
609                         if (skb_rtable(skb)->fl.iif == 0)
610                                 goto drop;
611                         stats->multicast++;
612                         skb->pkt_type = PACKET_BROADCAST;
613                 }
614 #endif
615
616                 if (((flags&GRE_CSUM) && csum) ||
617                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
618                         stats->rx_crc_errors++;
619                         stats->rx_errors++;
620                         goto drop;
621                 }
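                /* The sequence check below uses serial-number arithmetic:
                 * drop the packet if it carries no sequence number or if
                 * the signed difference to the expected value is negative,
                 * which also behaves correctly across wraparound. */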
622                 if (tunnel->parms.i_flags&GRE_SEQ) {
623                         if (!(flags&GRE_SEQ) ||
624                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
625                                 stats->rx_fifo_errors++;
626                                 stats->rx_errors++;
627                                 goto drop;
628                         }
629                         tunnel->i_seqno = seqno + 1;
630                 }
631
632                 len = skb->len;
633
634                 /* Warning: All skb pointers will be invalidated! */
635                 if (tunnel->dev->type == ARPHRD_ETHER) {
636                         if (!pskb_may_pull(skb, ETH_HLEN)) {
637                                 stats->rx_length_errors++;
638                                 stats->rx_errors++;
639                                 goto drop;
640                         }
641
642                         iph = ip_hdr(skb);
643                         skb->protocol = eth_type_trans(skb, tunnel->dev);
644                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
645                 }
646
647                 stats->rx_packets++;
648                 stats->rx_bytes += len;
649                 skb->dev = tunnel->dev;
650                 skb_dst_drop(skb);
651                 nf_reset(skb);
652
653                 skb_reset_network_header(skb);
654                 ipgre_ecn_decapsulate(iph, skb);
655
656                 netif_rx(skb);
657                 rcu_read_unlock();
658                 return(0);
659         }
660         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
661
662 drop:
663         rcu_read_unlock();
664 drop_nolock:
665         kfree_skb(skb);
666         return(0);
667 }
668
669 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
670 {
671         struct ip_tunnel *tunnel = netdev_priv(dev);
672         struct net_device_stats *stats = &dev->stats;
673         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
674         struct iphdr  *old_iph = ip_hdr(skb);
675         struct iphdr  *tiph;
676         u8     tos;
677         __be16 df;
678         struct rtable *rt;                      /* Route to the other host */
679         struct net_device *tdev;                        /* Device to other host */
680         struct iphdr  *iph;                     /* Our new IP header */
681         unsigned int max_headroom;              /* The extra header space needed */
682         int    gre_hlen;
683         __be32 dst;
684         int    mtu;
685
686         if (dev->type == ARPHRD_ETHER)
687                 IPCB(skb)->flags = 0;
688
689         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
690                 gre_hlen = 0;
691                 tiph = (struct iphdr *)skb->data;
692         } else {
693                 gre_hlen = tunnel->hlen;
694                 tiph = &tunnel->parms.iph;
695         }
696
697         if ((dst = tiph->daddr) == 0) {
698                 /* NBMA tunnel */
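                /* No fixed peer is configured, so the outer destination is
                 * recovered per packet: from the IPv4 route's gateway, or,
                 * for IPv6, from the IPv4-compatible address carried in the
                 * neighbour entry. */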
699
700                 if (skb_dst(skb) == NULL) {
701                         stats->tx_fifo_errors++;
702                         goto tx_error;
703                 }
704
705                 if (skb->protocol == htons(ETH_P_IP)) {
706                         rt = skb_rtable(skb);
707                         if ((dst = rt->rt_gateway) == 0)
708                                 goto tx_error_icmp;
709                 }
710 #ifdef CONFIG_IPV6
711                 else if (skb->protocol == htons(ETH_P_IPV6)) {
712                         struct in6_addr *addr6;
713                         int addr_type;
714                         struct neighbour *neigh = skb_dst(skb)->neighbour;
715
716                         if (neigh == NULL)
717                                 goto tx_error;
718
719                         addr6 = (struct in6_addr *)&neigh->primary_key;
720                         addr_type = ipv6_addr_type(addr6);
721
722                         if (addr_type == IPV6_ADDR_ANY) {
723                                 addr6 = &ipv6_hdr(skb)->daddr;
724                                 addr_type = ipv6_addr_type(addr6);
725                         }
726
727                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
728                                 goto tx_error_icmp;
729
730                         dst = addr6->s6_addr32[3];
731                 }
732 #endif
733                 else
734                         goto tx_error;
735         }
736
737         tos = tiph->tos;
738         if (tos == 1) {
739                 tos = 0;
740                 if (skb->protocol == htons(ETH_P_IP))
741                         tos = old_iph->tos;
742         }
743
744         {
745                 struct flowi fl = { .oif = tunnel->parms.link,
746                                     .nl_u = { .ip4_u =
747                                               { .daddr = dst,
748                                                 .saddr = tiph->saddr,
749                                                 .tos = RT_TOS(tos) } },
750                                     .proto = IPPROTO_GRE };
751                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
752                         stats->tx_carrier_errors++;
753                         goto tx_error;
754                 }
755         }
756         tdev = rt->u.dst.dev;
757
758         if (tdev == dev) {
759                 ip_rt_put(rt);
760                 stats->collisions++;
761                 goto tx_error;
762         }
763
764         df = tiph->frag_off;
765         if (df)
766                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
767         else
768                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
769
770         if (skb_dst(skb))
771                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
772
773         if (skb->protocol == htons(ETH_P_IP)) {
774                 df |= (old_iph->frag_off&htons(IP_DF));
775
776                 if ((old_iph->frag_off&htons(IP_DF)) &&
777                     mtu < ntohs(old_iph->tot_len)) {
778                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
779                         ip_rt_put(rt);
780                         goto tx_error;
781                 }
782         }
783 #ifdef CONFIG_IPV6
784         else if (skb->protocol == htons(ETH_P_IPV6)) {
785                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
786
787                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
788                         if ((tunnel->parms.iph.daddr &&
789                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
790                             rt6->rt6i_dst.plen == 128) {
791                                 rt6->rt6i_flags |= RTF_MODIFIED;
792                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
793                         }
794                 }
795
796                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
798                         ip_rt_put(rt);
799                         goto tx_error;
800                 }
801         }
802 #endif
803
804         if (tunnel->err_count > 0) {
805                 if (time_before(jiffies,
806                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
807                         tunnel->err_count--;
808
809                         dst_link_failure(skb);
810                 } else
811                         tunnel->err_count = 0;
812         }
813
814         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
815
816         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
817             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
818                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
819                 if (max_headroom > dev->needed_headroom)
820                         dev->needed_headroom = max_headroom;
821                 if (!new_skb) {
822                         ip_rt_put(rt);
823                         txq->tx_dropped++;
824                         dev_kfree_skb(skb);
825                         return NETDEV_TX_OK;
826                 }
827                 if (skb->sk)
828                         skb_set_owner_w(new_skb, skb->sk);
829                 dev_kfree_skb(skb);
830                 skb = new_skb;
831                 old_iph = ip_hdr(skb);
832         }
833
834         skb_reset_transport_header(skb);
835         skb_push(skb, gre_hlen);
836         skb_reset_network_header(skb);
837         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
838         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
839                               IPSKB_REROUTED);
840         skb_dst_drop(skb);
841         skb_dst_set(skb, &rt->u.dst);
842
843         /*
844          *      Push down and install the IPIP header.
845          */
846
847         iph                     =       ip_hdr(skb);
848         iph->version            =       4;
849         iph->ihl                =       sizeof(struct iphdr) >> 2;
850         iph->frag_off           =       df;
851         iph->protocol           =       IPPROTO_GRE;
852         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
853         iph->daddr              =       rt->rt_dst;
854         iph->saddr              =       rt->rt_src;
855
856         if ((iph->ttl = tiph->ttl) == 0) {
857                 if (skb->protocol == htons(ETH_P_IP))
858                         iph->ttl = old_iph->ttl;
859 #ifdef CONFIG_IPV6
860                 else if (skb->protocol == htons(ETH_P_IPV6))
861                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
862 #endif
863                 else
864                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
865         }
866
867         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
868         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
869                                    htons(ETH_P_TEB) : skb->protocol;
870
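        /*
         * The optional GRE fields are written back to front: ptr starts at
         * the last 32-bit word of the header and takes the sequence number,
         * then the key, then the checksum slot, so on the wire they appear
         * in the order checksum, key, sequence.  The checksum covers the
         * GRE header and payload (everything after the outer IP header).
         */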
871         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
872                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
873
874                 if (tunnel->parms.o_flags&GRE_SEQ) {
875                         ++tunnel->o_seqno;
876                         *ptr = htonl(tunnel->o_seqno);
877                         ptr--;
878                 }
879                 if (tunnel->parms.o_flags&GRE_KEY) {
880                         *ptr = tunnel->parms.o_key;
881                         ptr--;
882                 }
883                 if (tunnel->parms.o_flags&GRE_CSUM) {
884                         *ptr = 0;
885                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
886                 }
887         }
888
889         nf_reset(skb);
890
891         IPTUNNEL_XMIT();
892         return NETDEV_TX_OK;
893
894 tx_error_icmp:
895         dst_link_failure(skb);
896
897 tx_error:
898         stats->tx_errors++;
899         dev_kfree_skb(skb);
900         return NETDEV_TX_OK;
901 }
902
903 static int ipgre_tunnel_bind_dev(struct net_device *dev)
904 {
905         struct net_device *tdev = NULL;
906         struct ip_tunnel *tunnel;
907         struct iphdr *iph;
908         int hlen = LL_MAX_HEADER;
909         int mtu = ETH_DATA_LEN;
910         int addend = sizeof(struct iphdr) + 4;
911
912         tunnel = netdev_priv(dev);
913         iph = &tunnel->parms.iph;
914
915         /* Guess output device to choose reasonable mtu and needed_headroom */
916
917         if (iph->daddr) {
918                 struct flowi fl = { .oif = tunnel->parms.link,
919                                     .nl_u = { .ip4_u =
920                                               { .daddr = iph->daddr,
921                                                 .saddr = iph->saddr,
922                                                 .tos = RT_TOS(iph->tos) } },
923                                     .proto = IPPROTO_GRE };
924                 struct rtable *rt;
925                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
926                         tdev = rt->u.dst.dev;
927                         ip_rt_put(rt);
928                 }
929
930                 if (dev->type != ARPHRD_ETHER)
931                         dev->flags |= IFF_POINTOPOINT;
932         }
933
934         if (!tdev && tunnel->parms.link)
935                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
936
937         if (tdev) {
938                 hlen = tdev->hard_header_len + tdev->needed_headroom;
939                 mtu = tdev->mtu;
940         }
941         dev->iflink = tunnel->parms.link;
942
943         /* Precalculate GRE options length */
944         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
945                 if (tunnel->parms.o_flags&GRE_CSUM)
946                         addend += 4;
947                 if (tunnel->parms.o_flags&GRE_KEY)
948                         addend += 4;
949                 if (tunnel->parms.o_flags&GRE_SEQ)
950                         addend += 4;
951         }
952         dev->needed_headroom = addend + hlen;
953         mtu -= dev->hard_header_len + addend;
954
955         if (mtu < 68)
956                 mtu = 68;
957
958         tunnel->hlen = addend;
959
960         return mtu;
961 }
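
/*
 * A worked example of the calculation above (numbers are hypothetical but
 * follow the code): for a plain gre tunnel over an Ethernet underlay
 * (tdev->mtu == 1500) with the tunnel device's hard_header_len == 0,
 * addend is sizeof(struct iphdr) + 4 = 24, giving the familiar default
 * tunnel MTU of 1500 - 24 = 1476.  A key costs another 4 bytes (MTU 1472),
 * and key + checksum + sequence brings addend to 36 (MTU 1464).
 * needed_headroom is the underlay's headroom plus addend.
 */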
962
963 static int
964 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
965 {
966         int err = 0;
967         struct ip_tunnel_parm p;
968         struct ip_tunnel *t;
969         struct net *net = dev_net(dev);
970         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
971
972         switch (cmd) {
973         case SIOCGETTUNNEL:
974                 t = NULL;
975                 if (dev == ign->fb_tunnel_dev) {
976                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
977                                 err = -EFAULT;
978                                 break;
979                         }
980                         t = ipgre_tunnel_locate(net, &p, 0);
981                 }
982                 if (t == NULL)
983                         t = netdev_priv(dev);
984                 memcpy(&p, &t->parms, sizeof(p));
985                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
986                         err = -EFAULT;
987                 break;
988
989         case SIOCADDTUNNEL:
990         case SIOCCHGTUNNEL:
991                 err = -EPERM;
992                 if (!capable(CAP_NET_ADMIN))
993                         goto done;
994
995                 err = -EFAULT;
996                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
997                         goto done;
998
999                 err = -EINVAL;
1000                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1001                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1002                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1003                         goto done;
1004                 if (p.iph.ttl)
1005                         p.iph.frag_off |= htons(IP_DF);
1006
1007                 if (!(p.i_flags&GRE_KEY))
1008                         p.i_key = 0;
1009                 if (!(p.o_flags&GRE_KEY))
1010                         p.o_key = 0;
1011
1012                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1013
1014                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1015                         if (t != NULL) {
1016                                 if (t->dev != dev) {
1017                                         err = -EEXIST;
1018                                         break;
1019                                 }
1020                         } else {
1021                                 unsigned nflags = 0;
1022
1023                                 t = netdev_priv(dev);
1024
1025                                 if (ipv4_is_multicast(p.iph.daddr))
1026                                         nflags = IFF_BROADCAST;
1027                                 else if (p.iph.daddr)
1028                                         nflags = IFF_POINTOPOINT;
1029
1030                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1031                                         err = -EINVAL;
1032                                         break;
1033                                 }
1034                                 ipgre_tunnel_unlink(ign, t);
1035                                 t->parms.iph.saddr = p.iph.saddr;
1036                                 t->parms.iph.daddr = p.iph.daddr;
1037                                 t->parms.i_key = p.i_key;
1038                                 t->parms.o_key = p.o_key;
1039                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1040                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1041                                 ipgre_tunnel_link(ign, t);
1042                                 netdev_state_change(dev);
1043                         }
1044                 }
1045
1046                 if (t) {
1047                         err = 0;
1048                         if (cmd == SIOCCHGTUNNEL) {
1049                                 t->parms.iph.ttl = p.iph.ttl;
1050                                 t->parms.iph.tos = p.iph.tos;
1051                                 t->parms.iph.frag_off = p.iph.frag_off;
1052                                 if (t->parms.link != p.link) {
1053                                         t->parms.link = p.link;
1054                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1055                                         netdev_state_change(dev);
1056                                 }
1057                         }
1058                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1059                                 err = -EFAULT;
1060                 } else
1061                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1062                 break;
1063
1064         case SIOCDELTUNNEL:
1065                 err = -EPERM;
1066                 if (!capable(CAP_NET_ADMIN))
1067                         goto done;
1068
1069                 if (dev == ign->fb_tunnel_dev) {
1070                         err = -EFAULT;
1071                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1072                                 goto done;
1073                         err = -ENOENT;
1074                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1075                                 goto done;
1076                         err = -EPERM;
1077                         if (t == netdev_priv(ign->fb_tunnel_dev))
1078                                 goto done;
1079                         dev = t->dev;
1080                 }
1081                 unregister_netdevice(dev);
1082                 err = 0;
1083                 break;
1084
1085         default:
1086                 err = -EINVAL;
1087         }
1088
1089 done:
1090         return err;
1091 }
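
/*
 * A minimal userspace sketch of driving the ioctl interface above
 * (illustrative only: error handling omitted, the device name "gre1" and
 * the addresses are hypothetical).  The request is issued against the
 * fallback device "gre0" on an ordinary AF_INET socket, with the ifreq
 * data pointer aimed at a struct ip_tunnel_parm from <linux/if_tunnel.h>:
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strcpy(p.name, "gre1");
 *	p.iph.version  = 4;
 *	p.iph.ihl      = 5;
 *	p.iph.protocol = IPPROTO_GRE;
 *	p.iph.ttl      = 64;
 *	p.iph.saddr    = inet_addr("192.0.2.1");
 *	p.iph.daddr    = inet_addr("192.0.2.2");
 *	p.i_flags = p.o_flags = GRE_KEY;
 *	p.i_key   = p.o_key   = htonl(42);
 *
 *	strcpy(ifr.ifr_name, "gre0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);
 *
 * A non-zero ttl makes the kernel set IP_DF on the tunnel (see the
 * validation in ipgre_tunnel_ioctl above).
 */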
1092
1093 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1094 {
1095         struct ip_tunnel *tunnel = netdev_priv(dev);
1096         if (new_mtu < 68 ||
1097             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1098                 return -EINVAL;
1099         dev->mtu = new_mtu;
1100         return 0;
1101 }
1102
1103 /* Nice toy. Unfortunately, useless in real life :-)
1104    It allows constructing a virtual multiprotocol broadcast "LAN"
1105    over the Internet, provided multicast routing is tuned.
1106
1107
1108    I have no idea whether this bicycle was invented before me,
1109    so I had to set ARPHRD_IPGRE to a random value.
1110    I have the impression that Cisco could have made something similar,
1111    but this feature is apparently missing in IOS<=11.2(8).
1112
1113    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1114    with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)
1115
1116    ping -t 255 224.66.66.66
1117
1118    If nobody answers, mbone does not work.
1119
1120    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1121    ip addr add 10.66.66.<somewhat>/24 dev Universe
1122    ifconfig Universe up
1123    ifconfig Universe add fe80::<Your_real_addr>/10
1124    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1125    ftp 10.66.66.66
1126    ...
1127    ftp fec0:6666:6666::193.233.7.65
1128    ...
1129
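   Roughly equivalent iproute2 commands for the ifconfig lines above:

   ip link set Universe up
   ip -6 addr add fe80::<Your_real_addr>/10 dev Universe
   ip -6 addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
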
1130  */
1131
1132 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1133                         unsigned short type,
1134                         const void *daddr, const void *saddr, unsigned len)
1135 {
1136         struct ip_tunnel *t = netdev_priv(dev);
1137         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1138         __be16 *p = (__be16*)(iph+1);
1139
1140         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1141         p[0]            = t->parms.o_flags;
1142         p[1]            = htons(type);
1143
1144         /*
1145          *      Set the source hardware address.
1146          */
1147
1148         if (saddr)
1149                 memcpy(&iph->saddr, saddr, 4);
1150         if (daddr)
1151                 memcpy(&iph->daddr, daddr, 4);
1152         if (iph->daddr)
1153                 return t->hlen;
1154
1155         return -t->hlen;
1156 }
1157
1158 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159 {
1160         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161         memcpy(haddr, &iph->saddr, 4);
1162         return 4;
1163 }
1164
1165 static const struct header_ops ipgre_header_ops = {
1166         .create = ipgre_header,
1167         .parse  = ipgre_header_parse,
1168 };
1169
1170 #ifdef CONFIG_NET_IPGRE_BROADCAST
1171 static int ipgre_open(struct net_device *dev)
1172 {
1173         struct ip_tunnel *t = netdev_priv(dev);
1174
1175         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1176                 struct flowi fl = { .oif = t->parms.link,
1177                                     .nl_u = { .ip4_u =
1178                                               { .daddr = t->parms.iph.daddr,
1179                                                 .saddr = t->parms.iph.saddr,
1180                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1181                                     .proto = IPPROTO_GRE };
1182                 struct rtable *rt;
1183                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184                         return -EADDRNOTAVAIL;
1185                 dev = rt->u.dst.dev;
1186                 ip_rt_put(rt);
1187                 if (__in_dev_get_rtnl(dev) == NULL)
1188                         return -EADDRNOTAVAIL;
1189                 t->mlink = dev->ifindex;
1190                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1191         }
1192         return 0;
1193 }
1194
1195 static int ipgre_close(struct net_device *dev)
1196 {
1197         struct ip_tunnel *t = netdev_priv(dev);
1198
1199         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200                 struct in_device *in_dev;
1201                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1202                 if (in_dev) {
1203                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204                         in_dev_put(in_dev);
1205                 }
1206         }
1207         return 0;
1208 }
1209
1210 #endif
1211
1212 static const struct net_device_ops ipgre_netdev_ops = {
1213         .ndo_init               = ipgre_tunnel_init,
1214         .ndo_uninit             = ipgre_tunnel_uninit,
1215 #ifdef CONFIG_NET_IPGRE_BROADCAST
1216         .ndo_open               = ipgre_open,
1217         .ndo_stop               = ipgre_close,
1218 #endif
1219         .ndo_start_xmit         = ipgre_tunnel_xmit,
1220         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1221         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1222 };
1223
1224 static void ipgre_tunnel_setup(struct net_device *dev)
1225 {
1226         dev->netdev_ops         = &ipgre_netdev_ops;
1227         dev->destructor         = free_netdev;
1228
1229         dev->type               = ARPHRD_IPGRE;
1230         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1231         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1232         dev->flags              = IFF_NOARP;
1233         dev->iflink             = 0;
1234         dev->addr_len           = 4;
1235         dev->features           |= NETIF_F_NETNS_LOCAL;
1236         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1237 }
1238
1239 static int ipgre_tunnel_init(struct net_device *dev)
1240 {
1241         struct ip_tunnel *tunnel;
1242         struct iphdr *iph;
1243
1244         tunnel = netdev_priv(dev);
1245         iph = &tunnel->parms.iph;
1246
1247         tunnel->dev = dev;
1248         strcpy(tunnel->parms.name, dev->name);
1249
1250         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1252
1253         if (iph->daddr) {
1254 #ifdef CONFIG_NET_IPGRE_BROADCAST
1255                 if (ipv4_is_multicast(iph->daddr)) {
1256                         if (!iph->saddr)
1257                                 return -EINVAL;
1258                         dev->flags = IFF_BROADCAST;
1259                         dev->header_ops = &ipgre_header_ops;
1260                 }
1261 #endif
1262         } else
1263                 dev->header_ops = &ipgre_header_ops;
1264
1265         return 0;
1266 }
1267
1268 static void ipgre_fb_tunnel_init(struct net_device *dev)
1269 {
1270         struct ip_tunnel *tunnel = netdev_priv(dev);
1271         struct iphdr *iph = &tunnel->parms.iph;
1272         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1273
1274         tunnel->dev = dev;
1275         strcpy(tunnel->parms.name, dev->name);
1276
1277         iph->version            = 4;
1278         iph->protocol           = IPPROTO_GRE;
1279         iph->ihl                = 5;
1280         tunnel->hlen            = sizeof(struct iphdr) + 4;
1281
1282         dev_hold(dev);
1283         ign->tunnels_wc[0]      = tunnel;
1284 }
1285
1286
1287 static const struct net_protocol ipgre_protocol = {
1288         .handler        =       ipgre_rcv,
1289         .err_handler    =       ipgre_err,
1290         .netns_ok       =       1,
1291 };
1292
1293 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294 {
1295         int prio;
1296
1297         for (prio = 0; prio < 4; prio++) {
1298                 int h;
1299                 for (h = 0; h < HASH_SIZE; h++) {
1300                         struct ip_tunnel *t = ign->tunnels[prio][h];
1301
1302                         while (t != NULL) {
1303                                 unregister_netdevice_queue(t->dev, head);
1304                                 t = t->next;
1305                         }
1306                 }
1307         }
1308 }
1309
1310 static int __net_init ipgre_init_net(struct net *net)
1311 {
1312         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1313         int err;
1314
1315         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316                                            ipgre_tunnel_setup);
1317         if (!ign->fb_tunnel_dev) {
1318                 err = -ENOMEM;
1319                 goto err_alloc_dev;
1320         }
1321         dev_net_set(ign->fb_tunnel_dev, net);
1322
1323         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1324         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1325
1326         if ((err = register_netdev(ign->fb_tunnel_dev)))
1327                 goto err_reg_dev;
1328
1329         return 0;
1330
1331 err_reg_dev:
1332         free_netdev(ign->fb_tunnel_dev);
1333 err_alloc_dev:
1334         return err;
1335 }
1336
1337 static void __net_exit ipgre_exit_net(struct net *net)
1338 {
1339         struct ipgre_net *ign;
1340         LIST_HEAD(list);
1341
1342         ign = net_generic(net, ipgre_net_id);
1343         rtnl_lock();
1344         ipgre_destroy_tunnels(ign, &list);
1345         unregister_netdevice_many(&list);
1346         rtnl_unlock();
1347 }
1348
1349 static struct pernet_operations ipgre_net_ops = {
1350         .init = ipgre_init_net,
1351         .exit = ipgre_exit_net,
1352         .id   = &ipgre_net_id,
1353         .size = sizeof(struct ipgre_net),
1354 };
1355
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 {
1358         __be16 flags;
1359
1360         if (!data)
1361                 return 0;
1362
1363         flags = 0;
1364         if (data[IFLA_GRE_IFLAGS])
1365                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366         if (data[IFLA_GRE_OFLAGS])
1367                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368         if (flags & (GRE_VERSION|GRE_ROUTING))
1369                 return -EINVAL;
1370
1371         return 0;
1372 }
1373
1374 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375 {
1376         __be32 daddr;
1377
1378         if (tb[IFLA_ADDRESS]) {
1379                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380                         return -EINVAL;
1381                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382                         return -EADDRNOTAVAIL;
1383         }
1384
1385         if (!data)
1386                 goto out;
1387
1388         if (data[IFLA_GRE_REMOTE]) {
1389                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390                 if (!daddr)
1391                         return -EINVAL;
1392         }
1393
1394 out:
1395         return ipgre_tunnel_validate(tb, data);
1396 }
1397
1398 static void ipgre_netlink_parms(struct nlattr *data[],
1399                                 struct ip_tunnel_parm *parms)
1400 {
1401         memset(parms, 0, sizeof(*parms));
1402
1403         parms->iph.protocol = IPPROTO_GRE;
1404
1405         if (!data)
1406                 return;
1407
1408         if (data[IFLA_GRE_LINK])
1409                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410
1411         if (data[IFLA_GRE_IFLAGS])
1412                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413
1414         if (data[IFLA_GRE_OFLAGS])
1415                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416
1417         if (data[IFLA_GRE_IKEY])
1418                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419
1420         if (data[IFLA_GRE_OKEY])
1421                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422
1423         if (data[IFLA_GRE_LOCAL])
1424                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425
1426         if (data[IFLA_GRE_REMOTE])
1427                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428
1429         if (data[IFLA_GRE_TTL])
1430                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431
1432         if (data[IFLA_GRE_TOS])
1433                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434
1435         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436                 parms->iph.frag_off = htons(IP_DF);
1437 }
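
/*
 * For reference, the attributes parsed above correspond to what a modern
 * "ip link" invocation supplies (command line illustrative, addresses
 * hypothetical):
 *
 *	ip link add gre1 type gre local 192.0.2.1 remote 192.0.2.2 ttl 64 key 42
 *
 * which fills IFLA_GRE_LOCAL, IFLA_GRE_REMOTE and IFLA_GRE_TTL, sets
 * IFLA_GRE_IKEY and IFLA_GRE_OKEY to the same value, and adds GRE_KEY to
 * both IFLA_GRE_IFLAGS and IFLA_GRE_OFLAGS.
 */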
1438
1439 static int ipgre_tap_init(struct net_device *dev)
1440 {
1441         struct ip_tunnel *tunnel;
1442
1443         tunnel = netdev_priv(dev);
1444
1445         tunnel->dev = dev;
1446         strcpy(tunnel->parms.name, dev->name);
1447
1448         ipgre_tunnel_bind_dev(dev);
1449
1450         return 0;
1451 }
1452
1453 static const struct net_device_ops ipgre_tap_netdev_ops = {
1454         .ndo_init               = ipgre_tap_init,
1455         .ndo_uninit             = ipgre_tunnel_uninit,
1456         .ndo_start_xmit         = ipgre_tunnel_xmit,
1457         .ndo_set_mac_address    = eth_mac_addr,
1458         .ndo_validate_addr      = eth_validate_addr,
1459         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1460 };
1461
1462 static void ipgre_tap_setup(struct net_device *dev)
1463 {
1464
1465         ether_setup(dev);
1466
1467         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1468         dev->destructor         = free_netdev;
1469
1470         dev->iflink             = 0;
1471         dev->features           |= NETIF_F_NETNS_LOCAL;
1472 }
1473
1474 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1475                          struct nlattr *data[])
1476 {
1477         struct ip_tunnel *nt;
1478         struct net *net = dev_net(dev);
1479         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480         int mtu;
1481         int err;
1482
1483         nt = netdev_priv(dev);
1484         ipgre_netlink_parms(data, &nt->parms);
1485
1486         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1487                 return -EEXIST;
1488
1489         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490                 random_ether_addr(dev->dev_addr);
1491
1492         mtu = ipgre_tunnel_bind_dev(dev);
1493         if (!tb[IFLA_MTU])
1494                 dev->mtu = mtu;
1495
1496         err = register_netdevice(dev);
1497         if (err)
1498                 goto out;
1499
1500         dev_hold(dev);
1501         ipgre_tunnel_link(ign, nt);
1502
1503 out:
1504         return err;
1505 }
1506
1507 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508                             struct nlattr *data[])
1509 {
1510         struct ip_tunnel *t, *nt;
1511         struct net *net = dev_net(dev);
1512         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513         struct ip_tunnel_parm p;
1514         int mtu;
1515
1516         if (dev == ign->fb_tunnel_dev)
1517                 return -EINVAL;
1518
1519         nt = netdev_priv(dev);
1520         ipgre_netlink_parms(data, &p);
1521
1522         t = ipgre_tunnel_locate(net, &p, 0);
1523
1524         if (t) {
1525                 if (t->dev != dev)
1526                         return -EEXIST;
1527         } else {
1528                 t = nt;
1529
1530                 if (dev->type != ARPHRD_ETHER) {
1531                         unsigned nflags = 0;
1532
1533                         if (ipv4_is_multicast(p.iph.daddr))
1534                                 nflags = IFF_BROADCAST;
1535                         else if (p.iph.daddr)
1536                                 nflags = IFF_POINTOPOINT;
1537
1538                         if ((dev->flags ^ nflags) &
1539                             (IFF_POINTOPOINT | IFF_BROADCAST))
1540                                 return -EINVAL;
1541                 }
1542
1543                 ipgre_tunnel_unlink(ign, t);
1544                 t->parms.iph.saddr = p.iph.saddr;
1545                 t->parms.iph.daddr = p.iph.daddr;
1546                 t->parms.i_key = p.i_key;
1547                 if (dev->type != ARPHRD_ETHER) {
1548                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1549                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1550                 }
1551                 ipgre_tunnel_link(ign, t);
1552                 netdev_state_change(dev);
1553         }
1554
1555         t->parms.o_key = p.o_key;
1556         t->parms.iph.ttl = p.iph.ttl;
1557         t->parms.iph.tos = p.iph.tos;
1558         t->parms.iph.frag_off = p.iph.frag_off;
1559
1560         if (t->parms.link != p.link) {
1561                 t->parms.link = p.link;
1562                 mtu = ipgre_tunnel_bind_dev(dev);
1563                 if (!tb[IFLA_MTU])
1564                         dev->mtu = mtu;
1565                 netdev_state_change(dev);
1566         }
1567
1568         return 0;
1569 }
1570
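/*
 * Worst-case netlink attribute payload for one tunnel; must be kept in
 * sync with ipgre_fill_info() below.
 */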
1571 static size_t ipgre_get_size(const struct net_device *dev)
1572 {
1573         return
1574                 /* IFLA_GRE_LINK */
1575                 nla_total_size(4) +
1576                 /* IFLA_GRE_IFLAGS */
1577                 nla_total_size(2) +
1578                 /* IFLA_GRE_OFLAGS */
1579                 nla_total_size(2) +
1580                 /* IFLA_GRE_IKEY */
1581                 nla_total_size(4) +
1582                 /* IFLA_GRE_OKEY */
1583                 nla_total_size(4) +
1584                 /* IFLA_GRE_LOCAL */
1585                 nla_total_size(4) +
1586                 /* IFLA_GRE_REMOTE */
1587                 nla_total_size(4) +
1588                 /* IFLA_GRE_TTL */
1589                 nla_total_size(1) +
1590                 /* IFLA_GRE_TOS */
1591                 nla_total_size(1) +
1592                 /* IFLA_GRE_PMTUDISC */
1593                 nla_total_size(1) +
1594                 0;
1595 }
1596
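/*
 * Report the current tunnel parameters.  The NLA_PUT_* macros jump to
 * nla_put_failure when the message runs out of room, which a correct
 * ipgre_get_size() normally prevents.
 */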
1597 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1598 {
1599         struct ip_tunnel *t = netdev_priv(dev);
1600         struct ip_tunnel_parm *p = &t->parms;
1601
1602         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1603         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1604         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1605         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1606         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1607         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1608         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1609         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1610         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1611         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1612
1613         return 0;
1614
1615 nla_put_failure:
1616         return -EMSGSIZE;
1617 }
1618
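/*
 * Attribute policy: IFLA_GRE_LOCAL and IFLA_GRE_REMOTE carry raw 4-byte
 * IPv4 addresses, so they are validated by length rather than by type.
 */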
1619 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1620         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1621         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1622         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1623         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1624         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1625         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1626         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1627         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1628         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1629         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1630 };
1631
1632 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1633         .kind           = "gre",
1634         .maxtype        = IFLA_GRE_MAX,
1635         .policy         = ipgre_policy,
1636         .priv_size      = sizeof(struct ip_tunnel),
1637         .setup          = ipgre_tunnel_setup,
1638         .validate       = ipgre_tunnel_validate,
1639         .newlink        = ipgre_newlink,
1640         .changelink     = ipgre_changelink,
1641         .get_size       = ipgre_get_size,
1642         .fill_info      = ipgre_fill_info,
1643 };
1644
1645 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1646         .kind           = "gretap",
1647         .maxtype        = IFLA_GRE_MAX,
1648         .policy         = ipgre_policy,
1649         .priv_size      = sizeof(struct ip_tunnel),
1650         .setup          = ipgre_tap_setup,
1651         .validate       = ipgre_tap_validate,
1652         .newlink        = ipgre_newlink,
1653         .changelink     = ipgre_changelink,
1654         .get_size       = ipgre_get_size,
1655         .fill_info      = ipgre_fill_info,
1656 };
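/*
 * For illustration only (not part of this file's interface): with an
 * iproute2 build that understands the "gre" and "gretap" link kinds,
 * the rtnetlink ops above are exercised roughly as follows -- the names
 * and addresses below are made up:
 *
 *   ip link add gre1 type gre    local 192.0.2.1 remote 192.0.2.2 ttl 64
 *   ip link add tap1 type gretap local 192.0.2.1 remote 192.0.2.2 key 42
 *
 * "local"/"remote"/"ttl" become IFLA_GRE_LOCAL/REMOTE/TTL; "key" is
 * expected to set IFLA_GRE_IKEY/IFLA_GRE_OKEY together with GRE_KEY in
 * the i_flags/o_flags attributes.
 */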
1657
1658 /*
1659  *      And now the module code and the kernel interface.
1660  */
1661
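/*
 * Module init: register the per-namespace state, hook IPPROTO_GRE into
 * the IPv4 protocol table, then expose both link kinds over rtnetlink,
 * unwinding in reverse order if any step fails.
 */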
1662 static int __init ipgre_init(void)
1663 {
1664         int err;
1665
1666         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667
1668         err = register_pernet_device(&ipgre_net_ops);
1669         if (err < 0)
1670                 return err;
1671
1672         err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1673         if (err < 0) {
1674                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1675                 goto add_proto_failed;
1676         }
1677
1678         err = rtnl_link_register(&ipgre_link_ops);
1679         if (err < 0)
1680                 goto rtnl_link_failed;
1681
1682         err = rtnl_link_register(&ipgre_tap_ops);
1683         if (err < 0)
1684                 goto tap_ops_failed;
1685
1686 out:
1687         return err;
1688
1689 tap_ops_failed:
1690         rtnl_link_unregister(&ipgre_link_ops);
1691 rtnl_link_failed:
1692         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1693 add_proto_failed:
1694         unregister_pernet_device(&ipgre_net_ops);
1695         goto out;
1696 }
1697
1698 static void __exit ipgre_fini(void)
1699 {
1700         rtnl_link_unregister(&ipgre_tap_ops);
1701         rtnl_link_unregister(&ipgre_link_ops);
1702         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704         unregister_pernet_device(&ipgre_net_ops);
1705 }
1706
1707 module_init(ipgre_init);
1708 module_exit(ipgre_fini);
1709 MODULE_LICENSE("GPL");
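/*
 * The rtnl link aliases below expand to "rtnl-link-gre"/"rtnl-link-gretap",
 * so that a request for either link kind can auto-load this module.
 */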
1710 MODULE_ALIAS_RTNL_LINK("gre");
1711 MODULE_ALIAS_RTNL_LINK("gretap");