]> rtime.felk.cvut.cz Git - can-eth-gw-linux.git/blob - net/ipv4/ip_gre.c
6b3ca5ba4450599e15ebe8b3ed5597a51051821f
[can-eth-gw-linux.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if IS_ENABLED(CONFIG_IPV6)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
126 /* Fallback tunnel: no source, no destination, no key, no options */
127
128 #define HASH_SIZE  16
129
130 static int ipgre_net_id __read_mostly;
131 struct ipgre_net {
132         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
133
134         struct net_device *fb_tunnel_dev;
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matching a configured keyless tunnel,
152    will match the fallback tunnel.
153  */
154
155 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
156
157 #define tunnels_r_l     tunnels[3]
158 #define tunnels_r       tunnels[2]
159 #define tunnels_l       tunnels[1]
160 #define tunnels_wc      tunnels[0]
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168 /* often modified stats are per cpu, other are shared (netdev->stats) */
169 struct pcpu_tstats {
170         unsigned long   rx_packets;
171         unsigned long   rx_bytes;
172         unsigned long   tx_packets;
173         unsigned long   tx_bytes;
174 } __attribute__((aligned(4*sizeof(unsigned long))));
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
196 /* Given src, dst and key, find appropriate for input tunnel. */
197
198 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
199                                               __be32 remote, __be32 local,
200                                               __be32 key, __be16 gre_proto)
201 {
202         struct net *net = dev_net(dev);
203         int link = dev->ifindex;
204         unsigned int h0 = HASH(remote);
205         unsigned int h1 = HASH(key);
206         struct ip_tunnel *t, *cand = NULL;
207         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
208         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
209                        ARPHRD_ETHER : ARPHRD_IPGRE;
210         int score, cand_score = 4;
211
212         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
213                 if (local != t->parms.iph.saddr ||
214                     remote != t->parms.iph.daddr ||
215                     key != t->parms.i_key ||
216                     !(t->dev->flags & IFF_UP))
217                         continue;
218
219                 if (t->dev->type != ARPHRD_IPGRE &&
220                     t->dev->type != dev_type)
221                         continue;
222
223                 score = 0;
224                 if (t->parms.link != link)
225                         score |= 1;
226                 if (t->dev->type != dev_type)
227                         score |= 2;
228                 if (score == 0)
229                         return t;
230
231                 if (score < cand_score) {
232                         cand = t;
233                         cand_score = score;
234                 }
235         }
236
237         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
238                 if (remote != t->parms.iph.daddr ||
239                     key != t->parms.i_key ||
240                     !(t->dev->flags & IFF_UP))
241                         continue;
242
243                 if (t->dev->type != ARPHRD_IPGRE &&
244                     t->dev->type != dev_type)
245                         continue;
246
247                 score = 0;
248                 if (t->parms.link != link)
249                         score |= 1;
250                 if (t->dev->type != dev_type)
251                         score |= 2;
252                 if (score == 0)
253                         return t;
254
255                 if (score < cand_score) {
256                         cand = t;
257                         cand_score = score;
258                 }
259         }
260
261         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
262                 if ((local != t->parms.iph.saddr &&
263                      (local != t->parms.iph.daddr ||
264                       !ipv4_is_multicast(local))) ||
265                     key != t->parms.i_key ||
266                     !(t->dev->flags & IFF_UP))
267                         continue;
268
269                 if (t->dev->type != ARPHRD_IPGRE &&
270                     t->dev->type != dev_type)
271                         continue;
272
273                 score = 0;
274                 if (t->parms.link != link)
275                         score |= 1;
276                 if (t->dev->type != dev_type)
277                         score |= 2;
278                 if (score == 0)
279                         return t;
280
281                 if (score < cand_score) {
282                         cand = t;
283                         cand_score = score;
284                 }
285         }
286
287         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
288                 if (t->parms.i_key != key ||
289                     !(t->dev->flags & IFF_UP))
290                         continue;
291
292                 if (t->dev->type != ARPHRD_IPGRE &&
293                     t->dev->type != dev_type)
294                         continue;
295
296                 score = 0;
297                 if (t->parms.link != link)
298                         score |= 1;
299                 if (t->dev->type != dev_type)
300                         score |= 2;
301                 if (score == 0)
302                         return t;
303
304                 if (score < cand_score) {
305                         cand = t;
306                         cand_score = score;
307                 }
308         }
309
310         if (cand != NULL)
311                 return cand;
312
313         dev = ign->fb_tunnel_dev;
314         if (dev->flags & IFF_UP)
315                 return netdev_priv(dev);
316
317         return NULL;
318 }
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
339 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
340                 struct ip_tunnel *t)
341 {
342         return __ipgre_bucket(ign, &t->parms);
343 }
344
345 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
346 {
347         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
348
349         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
350         rcu_assign_pointer(*tp, t);
351 }
352
353 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
354 {
355         struct ip_tunnel __rcu **tp;
356         struct ip_tunnel *iter;
357
358         for (tp = ipgre_bucket(ign, t);
359              (iter = rtnl_dereference(*tp)) != NULL;
360              tp = &iter->next) {
361                 if (t == iter) {
362                         rcu_assign_pointer(*tp, t->next);
363                         break;
364                 }
365         }
366 }
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394                 struct ip_tunnel_parm *parms, int create)
395 {
396         struct ip_tunnel *t, *nt;
397         struct net_device *dev;
398         char name[IFNAMSIZ];
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402         if (t || !create)
403                 return t;
404
405         if (parms->name[0])
406                 strlcpy(name, parms->name, IFNAMSIZ);
407         else
408                 strcpy(name, "gre%d");
409
410         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411         if (!dev)
412                 return NULL;
413
414         dev_net_set(dev, net);
415
416         nt = netdev_priv(dev);
417         nt->parms = *parms;
418         dev->rtnl_link_ops = &ipgre_link_ops;
419
420         dev->mtu = ipgre_tunnel_bind_dev(dev);
421
422         if (register_netdevice(dev) < 0)
423                 goto failed_free;
424
425         /* Can use a lockless transmit, unless we generate output sequences */
426         if (!(nt->parms.o_flags & GRE_SEQ))
427                 dev->features |= NETIF_F_LLTX;
428
429         dev_hold(dev);
430         ipgre_tunnel_link(ign, nt);
431         return nt;
432
433 failed_free:
434         free_netdev(dev);
435         return NULL;
436 }
437
438 static void ipgre_tunnel_uninit(struct net_device *dev)
439 {
440         struct net *net = dev_net(dev);
441         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
442
443         ipgre_tunnel_unlink(ign, netdev_priv(dev));
444         dev_put(dev);
445 }
446
447
448 static void ipgre_err(struct sk_buff *skb, u32 info)
449 {
450
451 /* All the routers (except for Linux) return only
452    8 bytes of packet payload. It means, that precise relaying of
453    ICMP in the real Internet is absolutely infeasible.
454
455    Moreover, Cisco "wise men" put GRE key to the third word
456    in GRE header. It makes impossible maintaining even soft state for keyed
457    GRE tunnels with enabled checksum. Tell them "thank you".
458
459    Well, I wonder, rfc1812 was written by Cisco employee,
460    what the hell these idiots break standrads established
461    by themself???
462  */
463
464         const struct iphdr *iph = (const struct iphdr *)skb->data;
465         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
466         int grehlen = (iph->ihl<<2) + 4;
467         const int type = icmp_hdr(skb)->type;
468         const int code = icmp_hdr(skb)->code;
469         struct ip_tunnel *t;
470         __be16 flags;
471
472         flags = p[0];
473         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
474                 if (flags&(GRE_VERSION|GRE_ROUTING))
475                         return;
476                 if (flags&GRE_KEY) {
477                         grehlen += 4;
478                         if (flags&GRE_CSUM)
479                                 grehlen += 4;
480                 }
481         }
482
483         /* If only 8 bytes returned, keyed message will be dropped here */
484         if (skb_headlen(skb) < grehlen)
485                 return;
486
487         switch (type) {
488         default:
489         case ICMP_PARAMETERPROB:
490                 return;
491
492         case ICMP_DEST_UNREACH:
493                 switch (code) {
494                 case ICMP_SR_FAILED:
495                 case ICMP_PORT_UNREACH:
496                         /* Impossible event. */
497                         return;
498                 case ICMP_FRAG_NEEDED:
499                         /* Soft state for pmtu is maintained by IP core. */
500                         return;
501                 default:
502                         /* All others are translated to HOST_UNREACH.
503                            rfc2003 contains "deep thoughts" about NET_UNREACH,
504                            I believe they are just ether pollution. --ANK
505                          */
506                         break;
507                 }
508                 break;
509         case ICMP_TIME_EXCEEDED:
510                 if (code != ICMP_EXC_TTL)
511                         return;
512                 break;
513         }
514
515         rcu_read_lock();
516         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
517                                 flags & GRE_KEY ?
518                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
519                                 p[1]);
520         if (t == NULL || t->parms.iph.daddr == 0 ||
521             ipv4_is_multicast(t->parms.iph.daddr))
522                 goto out;
523
524         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
525                 goto out;
526
527         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
528                 t->err_count++;
529         else
530                 t->err_count = 1;
531         t->err_time = jiffies;
532 out:
533         rcu_read_unlock();
534 }
535
536 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
537 {
538         if (INET_ECN_is_ce(iph->tos)) {
539                 if (skb->protocol == htons(ETH_P_IP)) {
540                         IP_ECN_set_ce(ip_hdr(skb));
541                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
542                         IP6_ECN_set_ce(ipv6_hdr(skb));
543                 }
544         }
545 }
546
547 static inline u8
548 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
549 {
550         u8 inner = 0;
551         if (skb->protocol == htons(ETH_P_IP))
552                 inner = old_iph->tos;
553         else if (skb->protocol == htons(ETH_P_IPV6))
554                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
555         return INET_ECN_encapsulate(tos, inner);
556 }
557
558 static int ipgre_rcv(struct sk_buff *skb)
559 {
560         const struct iphdr *iph;
561         u8     *h;
562         __be16    flags;
563         __sum16   csum = 0;
564         __be32 key = 0;
565         u32    seqno = 0;
566         struct ip_tunnel *tunnel;
567         int    offset = 4;
568         __be16 gre_proto;
569
570         if (!pskb_may_pull(skb, 16))
571                 goto drop_nolock;
572
573         iph = ip_hdr(skb);
574         h = skb->data;
575         flags = *(__be16*)h;
576
577         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
578                 /* - Version must be 0.
579                    - We do not support routing headers.
580                  */
581                 if (flags&(GRE_VERSION|GRE_ROUTING))
582                         goto drop_nolock;
583
584                 if (flags&GRE_CSUM) {
585                         switch (skb->ip_summed) {
586                         case CHECKSUM_COMPLETE:
587                                 csum = csum_fold(skb->csum);
588                                 if (!csum)
589                                         break;
590                                 /* fall through */
591                         case CHECKSUM_NONE:
592                                 skb->csum = 0;
593                                 csum = __skb_checksum_complete(skb);
594                                 skb->ip_summed = CHECKSUM_COMPLETE;
595                         }
596                         offset += 4;
597                 }
598                 if (flags&GRE_KEY) {
599                         key = *(__be32*)(h + offset);
600                         offset += 4;
601                 }
602                 if (flags&GRE_SEQ) {
603                         seqno = ntohl(*(__be32*)(h + offset));
604                         offset += 4;
605                 }
606         }
607
608         gre_proto = *(__be16 *)(h + 2);
609
610         rcu_read_lock();
611         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
612                                           iph->saddr, iph->daddr, key,
613                                           gre_proto))) {
614                 struct pcpu_tstats *tstats;
615
616                 secpath_reset(skb);
617
618                 skb->protocol = gre_proto;
619                 /* WCCP version 1 and 2 protocol decoding.
620                  * - Change protocol to IP
621                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
622                  */
623                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
624                         skb->protocol = htons(ETH_P_IP);
625                         if ((*(h + offset) & 0xF0) != 0x40)
626                                 offset += 4;
627                 }
628
629                 skb->mac_header = skb->network_header;
630                 __pskb_pull(skb, offset);
631                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
632                 skb->pkt_type = PACKET_HOST;
633 #ifdef CONFIG_NET_IPGRE_BROADCAST
634                 if (ipv4_is_multicast(iph->daddr)) {
635                         /* Looped back packet, drop it! */
636                         if (rt_is_output_route(skb_rtable(skb)))
637                                 goto drop;
638                         tunnel->dev->stats.multicast++;
639                         skb->pkt_type = PACKET_BROADCAST;
640                 }
641 #endif
642
643                 if (((flags&GRE_CSUM) && csum) ||
644                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
645                         tunnel->dev->stats.rx_crc_errors++;
646                         tunnel->dev->stats.rx_errors++;
647                         goto drop;
648                 }
649                 if (tunnel->parms.i_flags&GRE_SEQ) {
650                         if (!(flags&GRE_SEQ) ||
651                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
652                                 tunnel->dev->stats.rx_fifo_errors++;
653                                 tunnel->dev->stats.rx_errors++;
654                                 goto drop;
655                         }
656                         tunnel->i_seqno = seqno + 1;
657                 }
658
659                 /* Warning: All skb pointers will be invalidated! */
660                 if (tunnel->dev->type == ARPHRD_ETHER) {
661                         if (!pskb_may_pull(skb, ETH_HLEN)) {
662                                 tunnel->dev->stats.rx_length_errors++;
663                                 tunnel->dev->stats.rx_errors++;
664                                 goto drop;
665                         }
666
667                         iph = ip_hdr(skb);
668                         skb->protocol = eth_type_trans(skb, tunnel->dev);
669                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
670                 }
671
672                 tstats = this_cpu_ptr(tunnel->dev->tstats);
673                 tstats->rx_packets++;
674                 tstats->rx_bytes += skb->len;
675
676                 __skb_tunnel_rx(skb, tunnel->dev);
677
678                 skb_reset_network_header(skb);
679                 ipgre_ecn_decapsulate(iph, skb);
680
681                 netif_rx(skb);
682
683                 rcu_read_unlock();
684                 return 0;
685         }
686         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
687
688 drop:
689         rcu_read_unlock();
690 drop_nolock:
691         kfree_skb(skb);
692         return 0;
693 }
694
695 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
696 {
697         struct ip_tunnel *tunnel = netdev_priv(dev);
698         struct pcpu_tstats *tstats;
699         const struct iphdr  *old_iph = ip_hdr(skb);
700         const struct iphdr  *tiph;
701         struct flowi4 fl4;
702         u8     tos;
703         __be16 df;
704         struct rtable *rt;                      /* Route to the other host */
705         struct net_device *tdev;                /* Device to other host */
706         struct iphdr  *iph;                     /* Our new IP header */
707         unsigned int max_headroom;              /* The extra header space needed */
708         int    gre_hlen;
709         __be32 dst;
710         int    mtu;
711
712         if (dev->type == ARPHRD_ETHER)
713                 IPCB(skb)->flags = 0;
714
715         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
716                 gre_hlen = 0;
717                 tiph = (const struct iphdr *)skb->data;
718         } else {
719                 gre_hlen = tunnel->hlen;
720                 tiph = &tunnel->parms.iph;
721         }
722
723         if ((dst = tiph->daddr) == 0) {
724                 /* NBMA tunnel */
725
726                 if (skb_dst(skb) == NULL) {
727                         dev->stats.tx_fifo_errors++;
728                         goto tx_error;
729                 }
730
731                 if (skb->protocol == htons(ETH_P_IP)) {
732                         rt = skb_rtable(skb);
733                         if ((dst = rt->rt_gateway) == 0)
734                                 goto tx_error_icmp;
735                 }
736 #if IS_ENABLED(CONFIG_IPV6)
737                 else if (skb->protocol == htons(ETH_P_IPV6)) {
738                         struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
739                         const struct in6_addr *addr6;
740                         int addr_type;
741
742                         if (neigh == NULL)
743                                 goto tx_error;
744
745                         addr6 = (const struct in6_addr *)&neigh->primary_key;
746                         addr_type = ipv6_addr_type(addr6);
747
748                         if (addr_type == IPV6_ADDR_ANY) {
749                                 addr6 = &ipv6_hdr(skb)->daddr;
750                                 addr_type = ipv6_addr_type(addr6);
751                         }
752
753                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
754                                 goto tx_error_icmp;
755
756                         dst = addr6->s6_addr32[3];
757                 }
758 #endif
759                 else
760                         goto tx_error;
761         }
762
763         tos = tiph->tos;
764         if (tos == 1) {
765                 tos = 0;
766                 if (skb->protocol == htons(ETH_P_IP))
767                         tos = old_iph->tos;
768                 else if (skb->protocol == htons(ETH_P_IPV6))
769                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
770         }
771
772         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
773                                  tunnel->parms.o_key, RT_TOS(tos),
774                                  tunnel->parms.link);
775         if (IS_ERR(rt)) {
776                 dev->stats.tx_carrier_errors++;
777                 goto tx_error;
778         }
779         tdev = rt->dst.dev;
780
781         if (tdev == dev) {
782                 ip_rt_put(rt);
783                 dev->stats.collisions++;
784                 goto tx_error;
785         }
786
787         df = tiph->frag_off;
788         if (df)
789                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
790         else
791                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
792
793         if (skb_dst(skb))
794                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
795
796         if (skb->protocol == htons(ETH_P_IP)) {
797                 df |= (old_iph->frag_off&htons(IP_DF));
798
799                 if ((old_iph->frag_off&htons(IP_DF)) &&
800                     mtu < ntohs(old_iph->tot_len)) {
801                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
802                         ip_rt_put(rt);
803                         goto tx_error;
804                 }
805         }
806 #if IS_ENABLED(CONFIG_IPV6)
807         else if (skb->protocol == htons(ETH_P_IPV6)) {
808                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
809
810                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
811                         if ((tunnel->parms.iph.daddr &&
812                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
813                             rt6->rt6i_dst.plen == 128) {
814                                 rt6->rt6i_flags |= RTF_MODIFIED;
815                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
816                         }
817                 }
818
819                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
820                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
821                         ip_rt_put(rt);
822                         goto tx_error;
823                 }
824         }
825 #endif
826
827         if (tunnel->err_count > 0) {
828                 if (time_before(jiffies,
829                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
830                         tunnel->err_count--;
831
832                         dst_link_failure(skb);
833                 } else
834                         tunnel->err_count = 0;
835         }
836
837         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
838
839         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
840             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
841                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
842                 if (max_headroom > dev->needed_headroom)
843                         dev->needed_headroom = max_headroom;
844                 if (!new_skb) {
845                         ip_rt_put(rt);
846                         dev->stats.tx_dropped++;
847                         dev_kfree_skb(skb);
848                         return NETDEV_TX_OK;
849                 }
850                 if (skb->sk)
851                         skb_set_owner_w(new_skb, skb->sk);
852                 dev_kfree_skb(skb);
853                 skb = new_skb;
854                 old_iph = ip_hdr(skb);
855         }
856
857         skb_reset_transport_header(skb);
858         skb_push(skb, gre_hlen);
859         skb_reset_network_header(skb);
860         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
861         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
862                               IPSKB_REROUTED);
863         skb_dst_drop(skb);
864         skb_dst_set(skb, &rt->dst);
865
866         /*
867          *      Push down and install the IPIP header.
868          */
869
870         iph                     =       ip_hdr(skb);
871         iph->version            =       4;
872         iph->ihl                =       sizeof(struct iphdr) >> 2;
873         iph->frag_off           =       df;
874         iph->protocol           =       IPPROTO_GRE;
875         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
876         iph->daddr              =       fl4.daddr;
877         iph->saddr              =       fl4.saddr;
878
879         if ((iph->ttl = tiph->ttl) == 0) {
880                 if (skb->protocol == htons(ETH_P_IP))
881                         iph->ttl = old_iph->ttl;
882 #if IS_ENABLED(CONFIG_IPV6)
883                 else if (skb->protocol == htons(ETH_P_IPV6))
884                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
885 #endif
886                 else
887                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
888         }
889
890         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
891         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
892                                    htons(ETH_P_TEB) : skb->protocol;
893
894         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
895                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
896
897                 if (tunnel->parms.o_flags&GRE_SEQ) {
898                         ++tunnel->o_seqno;
899                         *ptr = htonl(tunnel->o_seqno);
900                         ptr--;
901                 }
902                 if (tunnel->parms.o_flags&GRE_KEY) {
903                         *ptr = tunnel->parms.o_key;
904                         ptr--;
905                 }
906                 if (tunnel->parms.o_flags&GRE_CSUM) {
907                         *ptr = 0;
908                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
909                 }
910         }
911
912         nf_reset(skb);
913         tstats = this_cpu_ptr(dev->tstats);
914         __IPTUNNEL_XMIT(tstats, &dev->stats);
915         return NETDEV_TX_OK;
916
917 tx_error_icmp:
918         dst_link_failure(skb);
919
920 tx_error:
921         dev->stats.tx_errors++;
922         dev_kfree_skb(skb);
923         return NETDEV_TX_OK;
924 }
925
/*
 * Bind the tunnel to its underlying output device and compute a usable MTU.
 *
 * Routes towards the configured destination (if any) to guess the egress
 * device, precomputes the encapsulation header length implied by the
 * configured output flags (stored in tunnel->hlen), sets
 * dev->needed_headroom accordingly, and returns the MTU the tunnel device
 * should use.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP header + base GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly configured link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum IPv4 MTU (RFC 791). */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
986
/*
 * ioctl handler for GRE tunnel devices: SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL and SIOCDELTUNNEL, operating on a struct ip_tunnel_parm
 * passed through ifr->ifr_ifru.ifru_data.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device the request may name any tunnel. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity checks: IPv4 without options, GRE protocol, at most
		 * the DF bit set, and no unsupported version/routing flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are only meaningful when the matching flag is set. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parameters clash with a different tunnel. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Refuse changes that would flip the device between
				 * broadcast and point-to-point mode. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Addresses/keys participate in hashing: unlink,
				 * update, then relink under the new hash. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Report the parameters actually in effect back to userspace. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback tunnel itself must never be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1117
1118 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1119 {
1120         struct ip_tunnel *tunnel = netdev_priv(dev);
1121         if (new_mtu < 68 ||
1122             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1123                 return -EINVAL;
1124         dev->mtu = new_mtu;
1125         return 0;
1126 }
1127
1128 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1130    over the Internet, provided multicast routing is tuned.
1131
1132
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1135    I have an impression, that Cisco could make something similar,
1136    but this feature is apparently missing in IOS<=11.2(8).
1137
1138    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1139    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1140
1141    ping -t 255 224.66.66.66
1142
1143    If nobody answers, mbone does not work.
1144
1145    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1146    ip addr add 10.66.66.<somewhat>/24 dev Universe
1147    ifconfig Universe up
1148    ifconfig Universe add fe80::<Your_real_addr>/10
1149    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1150    ftp 10.66.66.66
1151    ...
1152    ftp fec0:6666:6666::193.233.7.65
1153    ...
1154
1155  */
1156
/*
 * header_ops->create: prepend the tunnel's outer IP + GRE header to the skb.
 * daddr/saddr, when supplied, are raw 4-byte IPv4 addresses (dev->addr_len
 * is 4) that override the configured outer addresses.
 * Returns the header length when the destination is known; returns minus
 * the header length when it is not, signalling an incomplete header to the
 * caller — NOTE(review): this follows the dev_hard_header convention; the
 * negative-return contract itself is defined outside this file.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE flags + protocol words follow the IP header */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}
1182
1183 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1184 {
1185         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1186         memcpy(haddr, &iph->saddr, 4);
1187         return 4;
1188 }
1189
/* Link-layer header operations for tunnels that need an explicit outer
 * destination (multicast/NBMA mode and the unconfigured case). */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1194
1195 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode tunnels: when the destination is an IPv4
 * multicast group, route towards it to find the real egress device and
 * join the group on that device.  The ifindex of the joined device is
 * remembered in t->mlink so ipgre_close() can leave the group again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		/* From here on, "dev" is the underlying egress device, not
		 * the tunnel device this function was called for. */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1221
1222 static int ipgre_close(struct net_device *dev)
1223 {
1224         struct ip_tunnel *t = netdev_priv(dev);
1225
1226         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1227                 struct in_device *in_dev;
1228                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1229                 if (in_dev)
1230                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1231         }
1232         return 0;
1233 }
1234
1235 #endif
1236
/* Device operations for plain (layer-3, ARPHRD_IPGRE) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1249
/* dev->destructor: release the per-cpu stats and the netdev itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1255
/*
 * Setup callback for plain GRE tunnel devices: install the netdev ops and
 * default link parameters before registration.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	/* Default headroom/MTU assume the minimal IP+GRE encapsulation (4
	 * bytes of GRE); ipgre_tunnel_bind_dev() refines these later. */
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" addresses are outer IPv4 addresses */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1270
/*
 * ndo_init for plain tunnels: mirror the configured outer addresses into
 * dev_addr/broadcast, choose header_ops based on the addressing mode, and
 * allocate the per-cpu statistics.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* Multicast destination: broadcast mode, which needs a local
		 * address and explicit header construction per packet. */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed destination (NBMA mode): callers supply it via
		 * the header ops. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1303
/*
 * Initialize the per-namespace fallback device "gre0", which catches GRE
 * packets not matching any configured tunnel.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;	/* 20-byte outer header, no IP options */
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* minimal IP+GRE encapsulation */

	/* Extra reference: the fallback device lives for the namespace's
	 * lifetime and is linked into the wildcard hash (tunnels_wc). */
	dev_hold(dev);
}
1319
1320
/* Receive and ICMP-error handlers registered for GREPROTO_CISCO. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1325
1326 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1327 {
1328         int prio;
1329
1330         for (prio = 0; prio < 4; prio++) {
1331                 int h;
1332                 for (h = 0; h < HASH_SIZE; h++) {
1333                         struct ip_tunnel *t;
1334
1335                         t = rtnl_dereference(ign->tunnels[prio][h]);
1336
1337                         while (t != NULL) {
1338                                 unregister_netdevice_queue(t->dev, head);
1339                                 t = rtnl_dereference(t->next);
1340                         }
1341                 }
1342         }
1343 }
1344
/*
 * Per-namespace init: allocate and register the fallback device "gre0"
 * and hook it into the wildcard tunnel hash.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* Registration failed: free the device directly since its
	 * destructor will not run through unregistration. */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1373
1374 static void __net_exit ipgre_exit_net(struct net *net)
1375 {
1376         struct ipgre_net *ign;
1377         LIST_HEAD(list);
1378
1379         ign = net_generic(net, ipgre_net_id);
1380         rtnl_lock();
1381         ipgre_destroy_tunnels(ign, &list);
1382         unregister_netdevice_many(&list);
1383         rtnl_unlock();
1384 }
1385
/* Per-network-namespace lifecycle hooks and private-state sizing. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1392
1393 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1394 {
1395         __be16 flags;
1396
1397         if (!data)
1398                 return 0;
1399
1400         flags = 0;
1401         if (data[IFLA_GRE_IFLAGS])
1402                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1403         if (data[IFLA_GRE_OFLAGS])
1404                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1405         if (flags & (GRE_VERSION|GRE_ROUTING))
1406                 return -EINVAL;
1407
1408         return 0;
1409 }
1410
1411 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1412 {
1413         __be32 daddr;
1414
1415         if (tb[IFLA_ADDRESS]) {
1416                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1417                         return -EINVAL;
1418                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1419                         return -EADDRNOTAVAIL;
1420         }
1421
1422         if (!data)
1423                 goto out;
1424
1425         if (data[IFLA_GRE_REMOTE]) {
1426                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1427                 if (!daddr)
1428                         return -EINVAL;
1429         }
1430
1431 out:
1432         return ipgre_tunnel_validate(tb, data);
1433 }
1434
/*
 * Translate netlink IFLA_GRE_* attributes into tunnel parameters.
 * Missing attributes leave their field zeroed, except path MTU discovery,
 * which defaults to on (DF set) unless explicitly disabled.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery defaults to enabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1475
/*
 * ndo_init for gretap (Ethernet-over-GRE) devices: bind to the underlying
 * device and allocate per-cpu statistics.  Unlike plain tunnels, no
 * header_ops are installed — ether_setup() in ipgre_tap_setup() handles
 * the Ethernet framing.
 */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1493
/* Device operations for gretap (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1503
/*
 * Setup callback for gretap devices: start from a standard Ethernet
 * device, then override the ops and tunnel-specific flags.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1515
/*
 * rtnl_link_ops->newlink: create a new tunnel from netlink attributes,
 * register the device and link it into the tunnel hash.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse to create a duplicate of an existing tunnel. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	/* An explicit IFLA_MTU from userspace takes precedence. */
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1552
/*
 * rtnl_link_ops->changelink: reconfigure an existing tunnel from netlink
 * attributes.  Parameters that participate in hashing (addresses, input
 * key) require unlinking and relinking the tunnel; the remainder (output
 * key, ttl, tos, frag_off, link) are updated in place.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters are fixed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters clash with a different existing tunnel. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Refuse changes that would flip the device between
			 * broadcast and point-to-point mode. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		/* An explicit IFLA_MTU from userspace takes precedence. */
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1616
1617 static size_t ipgre_get_size(const struct net_device *dev)
1618 {
1619         return
1620                 /* IFLA_GRE_LINK */
1621                 nla_total_size(4) +
1622                 /* IFLA_GRE_IFLAGS */
1623                 nla_total_size(2) +
1624                 /* IFLA_GRE_OFLAGS */
1625                 nla_total_size(2) +
1626                 /* IFLA_GRE_IKEY */
1627                 nla_total_size(4) +
1628                 /* IFLA_GRE_OKEY */
1629                 nla_total_size(4) +
1630                 /* IFLA_GRE_LOCAL */
1631                 nla_total_size(4) +
1632                 /* IFLA_GRE_REMOTE */
1633                 nla_total_size(4) +
1634                 /* IFLA_GRE_TTL */
1635                 nla_total_size(1) +
1636                 /* IFLA_GRE_TOS */
1637                 nla_total_size(1) +
1638                 /* IFLA_GRE_PMTUDISC */
1639                 nla_total_size(1) +
1640                 0;
1641 }
1642
/*
 * rtnl_link_ops->fill_info: dump the tunnel parameters as IFLA_GRE_*
 * attributes.  The NLA_PUT_* macros jump to nla_put_failure when the
 * message buffer runs out of space.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1664
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1677
/* rtnetlink ops for plain layer-3 GRE tunnels ("ip link add ... type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1690
/*
 * rtnl link ops for "gretap" devices; differs from ipgre_link_ops only
 * in the device setup/validate hooks — policy, sizes and the netlink
 * new/change/fill handlers are shared with the "gre" kind.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1703
1704 /*
1705  *      And now the modules code and kernel interface.
1706  */
1707
1708 static int __init ipgre_init(void)
1709 {
1710         int err;
1711
1712         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1713
1714         err = register_pernet_device(&ipgre_net_ops);
1715         if (err < 0)
1716                 return err;
1717
1718         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1719         if (err < 0) {
1720                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1721                 goto add_proto_failed;
1722         }
1723
1724         err = rtnl_link_register(&ipgre_link_ops);
1725         if (err < 0)
1726                 goto rtnl_link_failed;
1727
1728         err = rtnl_link_register(&ipgre_tap_ops);
1729         if (err < 0)
1730                 goto tap_ops_failed;
1731
1732 out:
1733         return err;
1734
1735 tap_ops_failed:
1736         rtnl_link_unregister(&ipgre_link_ops);
1737 rtnl_link_failed:
1738         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1739 add_proto_failed:
1740         unregister_pernet_device(&ipgre_net_ops);
1741         goto out;
1742 }
1743
/*
 * Module exit: tear down in the reverse of the registration order used
 * by ipgre_init() — both rtnl link kinds first, then the GRE protocol
 * handler, then the per-net ops.
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* failure can only be logged here; nothing more to do at exit */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1752
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* aliases so requests for the "gre"/"gretap" link kinds or the gre0
 * netdev can load this module on demand */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");