net: Introduce skb_tunnel_rx() helper
[safe/jmp/linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47
48 #ifdef CONFIG_IPV6
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #endif
53
54 /*
55    Problems & solutions
56    --------------------
57
58    1. The most important issue is detecting local dead loops.
59    They would cause complete host lockup in transmit, which
60    would be "resolved" by stack overflow or, if queueing is enabled,
61    with infinite looping in net_bh.
62
63    We cannot track such dead loops during route installation,
64    it is infeasible task. The most general solutions would be
65    to keep skb->encapsulation counter (sort of local ttl),
66    and silently drop packet when it expires. It is the best
67    solution, but it supposes maintaining a new variable in ALL
68    skb, even if no tunneling is used.
69
70    Current solution: HARD_TX_LOCK lock breaks dead loops.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    fastly degrades to value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
122 static void ipgre_tunnel_setup(struct net_device *dev);
123 static int ipgre_tunnel_bind_dev(struct net_device *dev);
124
125 /* Fallback tunnel: no source, no destination, no key, no options */
126
127 #define HASH_SIZE  16
128
129 static int ipgre_net_id __read_mostly;
struct ipgre_net {
        /* Four hash tables indexed by "prio" (see __ipgre_bucket()):
         * 0:(*,*) 1:(*,local) 2:(remote,*) 3:(remote,local). */
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        /* Fallback tunnel: catches keyless packets no other tunnel matched. */
        struct net_device *fb_tunnel_dev;
};
135
136 /* Tunnel hash table */
137
138 /*
139    4 hash tables:
140
141    3: (remote,local)
142    2: (remote,*)
143    1: (*,local)
144    0: (*,*)
145
146    We require exact key match i.e. if a key is present in packet
147    it will match only tunnel with the same key; if it is not present,
148    it will match only keyless tunnel.
149
150    All keyless packets, if not matched against configured keyless tunnels,
151    will match fallback tunnel.
152  */
153
154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
155
156 #define tunnels_r_l     tunnels[3]
157 #define tunnels_r       tunnels[2]
158 #define tunnels_l       tunnels[1]
159 #define tunnels_wc      tunnels[0]
160 /*
161  * Locking : hash tables are protected by RCU and a spinlock
162  */
163 static DEFINE_SPINLOCK(ipgre_lock);
164
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168 /* Given src, dst and key, find appropriate for input tunnel. */
169
/*
 * Find the best receiving tunnel for (remote, local, key, proto).
 *
 * Runs under rcu_read_lock().  The four hash tables are probed from most
 * to least specific: (remote,local), (remote,*), (*,local or multicast),
 * (*,*).  Within each table a candidate is scored:
 *   bit 0 set - bound to a different link (ifindex)
 *   bit 1 set - device type differs (ARPHRD_ETHER vs ARPHRD_IPGRE)
 * A score of 0 is an exact match and returned immediately; otherwise the
 * lowest-scoring candidate across all tables wins.  If nothing matched,
 * fall back to the per-namespace fallback device (if it is up).
 * Returns NULL when no tunnel accepts the packet.
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
                                              __be32 remote, __be32 local,
                                              __be32 key, __be16 gre_proto)
{
        struct net *net = dev_net(dev);
        int link = dev->ifindex;
        unsigned h0 = HASH(remote);
        unsigned h1 = HASH(key);
        struct ip_tunnel *t, *cand = NULL;
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        /* ETH_P_TEB means transparent Ethernet bridging (gretap). */
        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
                       ARPHRD_ETHER : ARPHRD_IPGRE;
        int score, cand_score = 4;      /* 4 > any real score (max 3) */

        /* Pass 1: fully specified tunnels (remote,local). */
        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                /* An ARPHRD_IPGRE tunnel may carry either payload type;
                 * anything else must match the wanted type exactly. */
                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 2: remote address only (remote,*). */
        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
                if (remote != t->parms.iph.daddr ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 3: local address only (*,local); a multicast destination
         * configured as the tunnel's daddr also matches here. */
        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
                if ((local != t->parms.iph.saddr &&
                     (local != t->parms.iph.daddr ||
                      !ipv4_is_multicast(local))) ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 4: wildcard tunnels (*,*); key must still match exactly. */
        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
                if (t->parms.i_key != key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        if (cand != NULL)
                return cand;

        /* Last resort: the namespace's fallback tunnel, if administratively up. */
        dev = ign->fb_tunnel_dev;
        if (dev->flags & IFF_UP)
                return netdev_priv(dev);

        return NULL;
}
291
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293                 struct ip_tunnel_parm *parms)
294 {
295         __be32 remote = parms->iph.daddr;
296         __be32 local = parms->iph.saddr;
297         __be32 key = parms->i_key;
298         unsigned h = HASH(key);
299         int prio = 0;
300
301         if (local)
302                 prio |= 1;
303         if (remote && !ipv4_is_multicast(remote)) {
304                 prio |= 2;
305                 h ^= HASH(remote);
306         }
307
308         return &ign->tunnels[prio][h];
309 }
310
/* Convenience wrapper: hash chain head for an already-created tunnel. */
static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
                struct ip_tunnel *t)
{
        return __ipgre_bucket(ign, &t->parms);
}
316
/*
 * Insert a tunnel at the head of its hash chain.  Writers are serialized
 * by ipgre_lock; readers walk the chain under RCU.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp = ipgre_bucket(ign, t);

        spin_lock_bh(&ipgre_lock);
        t->next = *tp;
        /* Publish only after t->next is set, so concurrent RCU readers
         * never see a half-initialized list element. */
        rcu_assign_pointer(*tp, t);
        spin_unlock_bh(&ipgre_lock);
}
326
/*
 * Remove a tunnel from its hash chain (no-op if it is not linked).
 * The unlink itself is done under ipgre_lock; freeing the tunnel is the
 * caller's problem and must respect the RCU grace period.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp;

        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
                if (t == *tp) {
                        spin_lock_bh(&ipgre_lock);
                        *tp = t->next;
                        spin_unlock_bh(&ipgre_lock);
                        break;
                }
        }
}
340
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342                                            struct ip_tunnel_parm *parms,
343                                            int type)
344 {
345         __be32 remote = parms->iph.daddr;
346         __be32 local = parms->iph.saddr;
347         __be32 key = parms->i_key;
348         int link = parms->link;
349         struct ip_tunnel *t, **tp;
350         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351
352         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353                 if (local == t->parms.iph.saddr &&
354                     remote == t->parms.iph.daddr &&
355                     key == t->parms.i_key &&
356                     link == t->parms.link &&
357                     type == t->dev->type)
358                         break;
359
360         return t;
361 }
362
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364                 struct ip_tunnel_parm *parms, int create)
365 {
366         struct ip_tunnel *t, *nt;
367         struct net_device *dev;
368         char name[IFNAMSIZ];
369         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
370
371         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
372         if (t || !create)
373                 return t;
374
375         if (parms->name[0])
376                 strlcpy(name, parms->name, IFNAMSIZ);
377         else
378                 sprintf(name, "gre%%d");
379
380         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381         if (!dev)
382           return NULL;
383
384         dev_net_set(dev, net);
385
386         if (strchr(name, '%')) {
387                 if (dev_alloc_name(dev, name) < 0)
388                         goto failed_free;
389         }
390
391         nt = netdev_priv(dev);
392         nt->parms = *parms;
393         dev->rtnl_link_ops = &ipgre_link_ops;
394
395         dev->mtu = ipgre_tunnel_bind_dev(dev);
396
397         if (register_netdevice(dev) < 0)
398                 goto failed_free;
399
400         dev_hold(dev);
401         ipgre_tunnel_link(ign, nt);
402         return nt;
403
404 failed_free:
405         free_netdev(dev);
406         return NULL;
407 }
408
/*
 * ndo_uninit: unhash the tunnel and drop the self-reference taken in
 * ipgre_tunnel_locate(); the netdev core performs the final free.
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        ipgre_tunnel_unlink(ign, netdev_priv(dev));
        dev_put(dev);
}
417
418
/*
 * ICMP error handler for IPPROTO_GRE.
 *
 * @skb points at the returned copy of our original (outer) IP datagram;
 * we parse the quoted GRE header to find which tunnel the error concerns
 * and record it in err_count/err_time, which the transmit path consults
 * to relay dst_link_failure() to the sender.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

        struct iphdr *iph = (struct iphdr *)skb->data;
        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
        /* Quoted outer IP header + 4 bytes of mandatory GRE header. */
        int grehlen = (iph->ihl<<2) + 4;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct ip_tunnel *t;
        __be16 flags;

        flags = p[0];
        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        return;
                /* Account for optional checksum and key words so the key
                 * (if present) lands at offset grehlen - 4. */
                if (flags&GRE_KEY) {
                        grehlen += 4;
                        if (flags&GRE_CSUM)
                                grehlen += 4;
                }
        }

        /* If only 8 bytes returned, keyed message will be dropped here */
        if (skb_headlen(skb) < grehlen)
                return;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                case ICMP_FRAG_NEEDED:
                        /* Soft state for pmtu is maintained by IP core. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH,
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;
        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                break;
        }

        rcu_read_lock();
        /* The quoted datagram was sent by us, so its daddr is the tunnel's
         * remote and saddr is our local address. */
        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
                                flags & GRE_KEY ?
                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
                                p[1]);
        if (t == NULL || t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                goto out;

        /* With inherited TTL, TTL-exceeded is expected (e.g. traceroute). */
        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                goto out;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
out:
        rcu_read_unlock();
        return;
}
507
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
509 {
510         if (INET_ECN_is_ce(iph->tos)) {
511                 if (skb->protocol == htons(ETH_P_IP)) {
512                         IP_ECN_set_ce(ip_hdr(skb));
513                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
514                         IP6_ECN_set_ce(ipv6_hdr(skb));
515                 }
516         }
517 }
518
519 static inline u8
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
521 {
522         u8 inner = 0;
523         if (skb->protocol == htons(ETH_P_IP))
524                 inner = old_iph->tos;
525         else if (skb->protocol == htons(ETH_P_IPV6))
526                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527         return INET_ECN_encapsulate(tos, inner);
528 }
529
/*
 * GRE receive handler (registered for IPPROTO_GRE).
 *
 * skb->data points at the GRE header; the outer IP header is still
 * accessible via ip_hdr().  Parses the optional checksum/key/sequence
 * fields, looks up the matching tunnel, strips the GRE header and hands
 * the inner packet back to the stack via netif_rx().  Packets with no
 * matching tunnel are answered with ICMP port unreachable.  Always
 * consumes the skb and returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
        struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;      /* mandatory GRE header: flags + protocol */
        __be16 gre_proto;

        /* 16 bytes = flags/proto (4) + optional csum (4) + key (4) + seq (4):
         * enough linear data for the largest header we parse below. */
        if (!pskb_may_pull(skb, 16))
                goto drop_nolock;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *(__be16*)h;

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop_nolock;

                if (flags&GRE_CSUM) {
                        /* The GRE checksum covers the whole GRE packet, so a
                         * correct packet yields a zero folded checksum. */
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32*)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32*)(h + offset));
                        offset += 4;
                }
        }

        gre_proto = *(__be16 *)(h + 2);

        rcu_read_lock();
        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
                                          iph->saddr, iph->daddr, key,
                                          gre_proto))) {
                struct net_device_stats *stats = &tunnel->dev->stats;

                secpath_reset(skb);

                skb->protocol = gre_proto;
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        /* High nibble 4 == IPv4 version; otherwise a WCCPv2
                         * redirect header precedes the IP packet. */
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                skb->mac_header = skb->network_header;
                __pskb_pull(skb, offset);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (skb_rtable(skb)->fl.iif == 0)
                                goto drop;
                        stats->multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                /* Reject packets whose checksum failed, or that lack the
                 * checksum our configuration requires. */
                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        stats->rx_crc_errors++;
                        stats->rx_errors++;
                        goto drop;
                }
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        /* Signed wraparound-safe "seqno < i_seqno" test. */
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                stats->rx_fifo_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }

                /* Warning: All skb pointers will be invalidated! */
                if (tunnel->dev->type == ARPHRD_ETHER) {
                        if (!pskb_may_pull(skb, ETH_HLEN)) {
                                stats->rx_length_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }

                        /* pskb_may_pull() may have reallocated the head. */
                        iph = ip_hdr(skb);
                        skb->protocol = eth_type_trans(skb, tunnel->dev);
                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
                }

                /* Re-home the skb on the tunnel device and bump its
                 * rx stats (the skb_tunnel_rx() helper). */
                skb_tunnel_rx(skb, tunnel->dev);

                skb_reset_network_header(skb);
                ipgre_ecn_decapsulate(iph, skb);

                netif_rx(skb);
                rcu_read_unlock();
                return(0);
        }
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        rcu_read_unlock();
drop_nolock:
        kfree_skb(skb);
        return(0);
}
661
/*
 * GRE transmit path (ndo_start_xmit).
 *
 * Resolves the outer destination (configured, or per-packet for NBMA
 * tunnels), routes the encapsulated packet, enforces PMTU, makes room
 * for the outer headers, then builds the outer IP + GRE header in place
 * and hands the result to the IP layer via IPTUNNEL_XMIT().
 * Always returns NETDEV_TX_OK; undeliverable packets are dropped with
 * the appropriate stats counter bumped.
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net_device_stats *stats = &dev->stats;
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
        struct iphdr  *old_iph = ip_hdr(skb);
        struct iphdr  *tiph;
        u8     tos;
        __be16 df;
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                        /* Device to other host */
        struct iphdr  *iph;                     /* Our new IP header */
        unsigned int max_headroom;              /* The extra header space needed */
        int    gre_hlen;
        __be32 dst;
        int    mtu;

        if (dev->type == ARPHRD_ETHER)
                IPCB(skb)->flags = 0;

        /* With header_ops (NBMA mode) the caller has already built the
         * outer header at skb->data; use it as the header template. */
        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
                gre_hlen = 0;
                tiph = (struct iphdr *)skb->data;
        } else {
                gre_hlen = tunnel->hlen;
                tiph = &tunnel->parms.iph;
        }

        if ((dst = tiph->daddr) == 0) {
                /* NBMA tunnel */

                if (skb_dst(skb) == NULL) {
                        stats->tx_fifo_errors++;
                        goto tx_error;
                }

                /* Derive the outer destination from the inner packet's
                 * next hop (IPv4) or neighbour entry (IPv6-compatible). */
                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        if ((dst = rt->rt_gateway) == 0)
                                goto tx_error_icmp;
                }
#ifdef CONFIG_IPV6
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        struct in6_addr *addr6;
                        int addr_type;
                        struct neighbour *neigh = skb_dst(skb)->neighbour;

                        if (neigh == NULL)
                                goto tx_error;

                        addr6 = (struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        /* Only IPv4-compatible addresses (::a.b.c.d) carry
                         * a usable IPv4 destination in the last word. */
                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                goto tx_error_icmp;

                        dst = addr6->s6_addr32[3];
                }
#endif
                else
                        goto tx_error;
        }

        /* tos == 1 is the "inherit" sentinel: copy the inner IPv4 TOS. */
        tos = tiph->tos;
        if (tos == 1) {
                tos = 0;
                if (skb->protocol == htons(ETH_P_IP))
                        tos = old_iph->tos;
        }

        {
                struct flowi fl = { .oif = tunnel->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = dst,
                                                .saddr = tiph->saddr,
                                                .tos = RT_TOS(tos) } },
                                    .proto = IPPROTO_GRE };
                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
                        stats->tx_carrier_errors++;
                        goto tx_error;
                }
        }
        tdev = rt->u.dst.dev;

        /* Routing back to ourselves would loop (see comment at the top). */
        if (tdev == dev) {
                ip_rt_put(rt);
                stats->collisions++;
                goto tx_error;
        }

        df = tiph->frag_off;
        if (df)
                mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                df |= (old_iph->frag_off&htons(IP_DF));

                /* Inner packet has DF set and does not fit: relay
                 * fragmentation-needed to the sender. */
                if ((old_iph->frag_off&htons(IP_DF)) &&
                    mtu < ntohs(old_iph->tot_len)) {
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        ip_rt_put(rt);
                        goto tx_error;
                }
        }
#ifdef CONFIG_IPV6
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                /* Record the reduced MTU on the IPv6 route when it is
                 * host-specific or the tunnel has a fixed unicast remote. */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
                        }
                }

                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        ip_rt_put(rt);
                        goto tx_error;
                }
        }
#endif

        /* Relay recently received ICMP errors (see ipgre_err()) to the
         * sender instead of blindly transmitting into a black hole. */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;

        /* Reallocate if there is no room for the outer headers, or the skb
         * is shared/cloned and thus not safe to prepend to in place. */
        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
                if (max_headroom > dev->needed_headroom)
                        dev->needed_headroom = max_headroom;
                if (!new_skb) {
                        ip_rt_put(rt);
                        txq->tx_dropped++;
                        dev_kfree_skb(skb);
                        return NETDEV_TX_OK;
                }
                if (skb->sk)
                        skb_set_owner_w(new_skb, skb->sk);
                dev_kfree_skb(skb);
                skb = new_skb;
                old_iph = ip_hdr(skb);
        }

        skb_reset_transport_header(skb);
        skb_push(skb, gre_hlen);
        skb_reset_network_header(skb);
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
                              IPSKB_REROUTED);
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->u.dst);

        /*
         *      Push down and install the IPIP header.
         */

        iph                     =       ip_hdr(skb);
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr) >> 2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_GRE;
        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;

        /* TTL 0 means "inherit": copy from the inner header, or fall back
         * to the route's hop-limit metric for non-IP payloads. */
        if ((iph->ttl = tiph->ttl) == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
                else
                        iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
        }

        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
                                   htons(ETH_P_TEB) : skb->protocol;

        /* Fill the optional fields back-to-front: seq is last, then key,
         * then checksum (which must be computed after everything else). */
        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

                if (tunnel->parms.o_flags&GRE_SEQ) {
                        ++tunnel->o_seqno;
                        *ptr = htonl(tunnel->o_seqno);
                        ptr--;
                }
                if (tunnel->parms.o_flags&GRE_KEY) {
                        *ptr = tunnel->parms.o_key;
                        ptr--;
                }
                if (tunnel->parms.o_flags&GRE_CSUM) {
                        *ptr = 0;
                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
                }
        }

        nf_reset(skb);

        IPTUNNEL_XMIT();
        return NETDEV_TX_OK;

tx_error_icmp:
        dst_link_failure(skb);

tx_error:
        stats->tx_errors++;
        dev_kfree_skb(skb);
        return NETDEV_TX_OK;
}
895
/*
 * Bind the tunnel to its underlying output device and compute a usable MTU.
 *
 * Guesses the egress device by routing towards the tunnel remote address
 * (falling back to parms.link), then derives needed_headroom and the GRE
 * option length: checksum, key and sequence number each add 4 bytes on
 * top of the 4-byte base GRE header.
 *
 * Returns the MTU the tunnel device should use (never below 68).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		/* a fixed remote makes a non-tap tunnel point-to-point */
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* no route found (or no remote): fall back to the configured link */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum IPv4 MTU (RFC 791) */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
955
/*
 * Legacy ioctl interface (SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL) exchanging an ip_tunnel_parm with userspace through
 * ifr->ifr_ifru.ifru_data.
 *
 * Operations on the fallback device ("gre0") address the tunnel named by
 * the copied-in parameters; operations on any other device act on that
 * device itself.  Called under RTNL.  Returns 0 or a negative errno.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			/* on gre0, look up the tunnel the user described */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		/* no match (or not gre0): report this device's parameters */
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* sanity-check the outer header template; GRE version and
		 * routing bits are unsupported */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* keys only matter when the corresponding flag is set */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* parameters already belong to another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				/* changing addresses must not flip the device
				 * between broadcast and point-to-point modes */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* rehash: addresses and keys select the bucket */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					/* rebind to the new underlying link */
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* echo the resulting parameters back to userspace */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* the fallback device itself may not be deleted */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1085
1086 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1087 {
1088         struct ip_tunnel *tunnel = netdev_priv(dev);
1089         if (new_mtu < 68 ||
1090             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1091                 return -EINVAL;
1092         dev->mtu = new_mtu;
1093         return 0;
1094 }
1095
1096 /* Nice toy. Unfortunately, useless in real life :-)
1097    It allows to construct virtual multiprotocol broadcast "LAN"
1098    over the Internet, provided multicast routing is tuned.
1099
1100
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1103    I have an impression, that Cisco could make something similar,
1104    but this feature is apparently missing in IOS<=11.2(8).
1105
1106    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1107    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1108
1109    ping -t 255 224.66.66.66
1110
1111    If nobody answers, mbone does not work.
1112
1113    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1114    ip addr add 10.66.66.<somewhat>/24 dev Universe
1115    ifconfig Universe up
1116    ifconfig Universe add fe80::<Your_real_addr>/10
1117    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1118    ftp 10.66.66.66
1119    ...
1120    ftp fec0:6666:6666::193.233.7.65
1121    ...
1122
1123  */
1124
/*
 * header_ops->create for broadcast-mode GRE: prebuild the outer IP + GRE
 * header in front of the payload.  @daddr/@saddr, when given, are 4-byte
 * IPv4 addresses (the tunnel's "hardware" addresses).
 *
 * Returns the header length when the destination is known, or the
 * negative length to signal the header must be completed later.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE flags/proto follow the IP header */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}
1150
1151 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1152 {
1153         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154         memcpy(haddr, &iph->saddr, 4);
1155         return 4;
1156 }
1157
/* Link-layer header ops for NBMA / multicast GRE tunnels */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1162
1163 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode tunnels: when the remote is a multicast
 * group, join that group on the device the route to the group points at,
 * and remember its ifindex in t->mlink so ipgre_close() can leave again.
 * Called under RTNL.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;	/* from here on, the egress device */
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1187
/*
 * ndo_stop counterpart of ipgre_open(): leave the multicast group on the
 * device recorded in t->mlink, if that device still exists.
 */
static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);	/* drop ref from inetdev_by_index() */
		}
	}
	return 0;
}
1202
1203 #endif
1204
/* Device operations for plain (non-tap) GRE tunnel devices */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1216
/* Common setup for plain GRE devices; also used by the "gre" rtnl_link ops */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	/* room/MTU for the outer IP header plus the minimal 4-byte GRE header */
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" addresses are IPv4 addresses */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1231
/*
 * ndo_init for plain GRE devices: record the device, mirror the tunnel
 * endpoints into dev_addr/broadcast, and pick header_ops.  NBMA tunnels
 * (no fixed remote) and multicast tunnels need ipgre_header() to build
 * the outer header per packet.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* a multicast remote requires a fixed local address */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1260
/*
 * Initialize the per-namespace fallback device "gre0", which catches GRE
 * packets that match no configured tunnel.  It is linked into the
 * wildcard hash chain; the extra dev_hold() pins it for the namespace.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1278
1279
/* Inet protocol handler registered for IPPROTO_GRE */
static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1285
1286 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1287 {
1288         int prio;
1289
1290         for (prio = 0; prio < 4; prio++) {
1291                 int h;
1292                 for (h = 0; h < HASH_SIZE; h++) {
1293                         struct ip_tunnel *t = ign->tunnels[prio][h];
1294
1295                         while (t != NULL) {
1296                                 unregister_netdevice_queue(t->dev, head);
1297                                 t = t->next;
1298                         }
1299                 }
1300         }
1301 }
1302
/*
 * Per-netns init: allocate, set up and register the fallback "gre0"
 * device.  Unwinds via the goto chain on failure.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1329
1330 static void __net_exit ipgre_exit_net(struct net *net)
1331 {
1332         struct ipgre_net *ign;
1333         LIST_HEAD(list);
1334
1335         ign = net_generic(net, ipgre_net_id);
1336         rtnl_lock();
1337         ipgre_destroy_tunnels(ign, &list);
1338         unregister_netdevice_many(&list);
1339         rtnl_unlock();
1340 }
1341
/* Per-network-namespace registration for the GRE tunnel state */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1348
1349 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1350 {
1351         __be16 flags;
1352
1353         if (!data)
1354                 return 0;
1355
1356         flags = 0;
1357         if (data[IFLA_GRE_IFLAGS])
1358                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1359         if (data[IFLA_GRE_OFLAGS])
1360                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1361         if (flags & (GRE_VERSION|GRE_ROUTING))
1362                 return -EINVAL;
1363
1364         return 0;
1365 }
1366
1367 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1368 {
1369         __be32 daddr;
1370
1371         if (tb[IFLA_ADDRESS]) {
1372                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1373                         return -EINVAL;
1374                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1375                         return -EADDRNOTAVAIL;
1376         }
1377
1378         if (!data)
1379                 goto out;
1380
1381         if (data[IFLA_GRE_REMOTE]) {
1382                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1383                 if (!daddr)
1384                         return -EINVAL;
1385         }
1386
1387 out:
1388         return ipgre_tunnel_validate(tb, data);
1389 }
1390
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Missing attributes keep their zeroed defaults; path-MTU discovery
 * (the DF bit) defaults to on unless IFLA_GRE_PMTUDISC is 0.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1431
1432 static int ipgre_tap_init(struct net_device *dev)
1433 {
1434         struct ip_tunnel *tunnel;
1435
1436         tunnel = netdev_priv(dev);
1437
1438         tunnel->dev = dev;
1439         strcpy(tunnel->parms.name, dev->name);
1440
1441         ipgre_tunnel_bind_dev(dev);
1442
1443         return 0;
1444 }
1445
/* Device operations for gretap (Ethernet-over-GRE) devices */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1454
/* Setup for gretap devices: an Ethernet device whose frames ride in GRE */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= free_netdev;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1466
/*
 * rtnl_link_ops->newlink: create a gre/gretap device from netlink
 * attributes.  Fails with -EEXIST if an identical tunnel already exists.
 * Called under RTNL.  Returns 0 or a negative errno.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	/* honour an explicit IFLA_MTU from userspace */
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1499
/*
 * rtnl_link_ops->changelink: reconfigure an existing gre/gretap device
 * from netlink attributes.  The fallback device cannot be changed.
 * Rejects parameters that already belong to a different device, or that
 * would flip a non-Ethernet device between bcast/ptp modes.  Called
 * under RTNL.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* parameters already in use — only OK if they are ours */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* address change must not alter the device mode */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* rehash: addresses and input key select the bucket */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* these fields do not affect hashing, so update unconditionally */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		/* rebind to the new underlying link */
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1563
/* Worst-case netlink attribute space needed by ipgre_fill_info() */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1589
/*
 * rtnl_link_ops->fill_info: dump the tunnel parameters as IFLA_GRE_*
 * attributes.  The NLA_PUT_* macros jump to nla_put_failure when the
 * skb runs out of room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from the DF bit */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1611
/* Netlink attribute policy shared by the gre and gretap link types */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1624
/* rtnl_link operations for "ip link add ... type gre" */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1637
/* rtnl_link operations for "ip link add ... type gretap" */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1650
1651 /*
1652  *      And now the modules code and kernel interface.
1653  */
1654
/*
 * Module init: register per-netns state, the IPPROTO_GRE protocol
 * handler and both rtnl_link ops.  Unwinds in reverse order on failure.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}
1690
/* Module exit: tear everything down in the reverse order of ipgre_init() */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1699
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* allow autoloading via "ip link add ... type gre|gretap" */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");