/*
 * Source: net/ipv4/ip_gre.c from can-eth-gw-linux.git
 * (commit 1c012cb2cb941ad09132f0eceb18de7dd24737eb, rtime.felk.cvut.cz gitweb)
 */
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
96    taking into account fragmentation. TO be short, ttl is not solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
128 /* Fallback tunnel: no source, no destination, no key, no options */
129
130 #define HASH_SIZE  16
131
/* Index into each struct net's generic-netns array for our per-netns state. */
static int ipgre_net_id __read_mostly;

/* Per-network-namespace GRE state. */
struct ipgre_net {
	/* 4 hash tables keyed by (remote,local) specificity; see comment
	 * above HASH() for the layout.  Protected by RTNL for writers and
	 * RCU for readers.
	 */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Catch-all "gre0" device that receives otherwise unmatched,
	 * keyless packets.
	 */
	struct net_device *fb_tunnel_dev;
};
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
153    All keysless packets, if not matched configured keyless tunnels
154    will match fallback tunnel.
155  */
156
157 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
158
159 #define tunnels_r_l     tunnels[3]
160 #define tunnels_r       tunnels[2]
161 #define tunnels_l       tunnels[1]
162 #define tunnels_wc      tunnels[0]
163 /*
164  * Locking : hash tables are protected by RCU and RTNL
165  */
166
167 #define for_each_ip_tunnel_rcu(start) \
168         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;	/* received packets */
	u64	rx_bytes;	/* received bytes */
	u64	tx_packets;	/* transmitted packets */
	u64	tx_bytes;	/* transmitted bytes */
	/* Makes the 64-bit counters readable without tearing on 32-bit SMP. */
	struct u64_stats_sync	syncp;
};
178
179 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180                                                    struct rtnl_link_stats64 *tot)
181 {
182         int i;
183
184         for_each_possible_cpu(i) {
185                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
186                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
187                 unsigned int start;
188
189                 do {
190                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
191                         rx_packets = tstats->rx_packets;
192                         tx_packets = tstats->tx_packets;
193                         rx_bytes = tstats->rx_bytes;
194                         tx_bytes = tstats->tx_bytes;
195                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
196
197                 tot->rx_packets += rx_packets;
198                 tot->tx_packets += tx_packets;
199                 tot->rx_bytes   += rx_bytes;
200                 tot->tx_bytes   += tx_bytes;
201         }
202
203         tot->multicast = dev->stats.multicast;
204         tot->rx_crc_errors = dev->stats.rx_crc_errors;
205         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206         tot->rx_length_errors = dev->stats.rx_length_errors;
207         tot->rx_errors = dev->stats.rx_errors;
208         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210         tot->tx_dropped = dev->stats.tx_dropped;
211         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212         tot->tx_errors = dev->stats.tx_errors;
213
214         return tot;
215 }
216
217 /* Does key in tunnel parameters match packet */
218 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
219                             __u32 flags, __be32 key)
220 {
221         if (p->i_flags & GRE_KEY) {
222                 if (flags & GRE_KEY)
223                         return key == p->i_key;
224                 else
225                         return false;   /* key expected, none present */
226         } else
227                 return !(flags & GRE_KEY);
228 }
229
230 /* Given src, dst and key, find appropriate for input tunnel. */
231
/*
 * Given src, dst and key, find the appropriate tunnel for an incoming
 * packet.  Runs under RCU.  The four hash tables are scanned from most
 * to least specific ((remote,local), (remote,*), (*,local), (*,*));
 * within each table an exact (link, dev_type) match wins immediately,
 * otherwise the best-scoring candidate found so far is remembered.
 * Falls back to the namespace's gre0 device when nothing else matched.
 */
static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __u32 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payload means Transparent Ethernet Bridging (gretap). */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* score: bit 0 = link mismatch, bit 1 = dev_type mismatch;
	 * lower is better, 4 means "no candidate yet".
	 */
	int score, cand_score = 4;

	/* Table 3: both endpoints configured. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		/* ARPHRD_IPGRE tunnels accept any payload type. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;	/* perfect match */

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: remote configured, any local. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 1: local configured, any remote.  A multicast "local"
	 * may also match a tunnel's daddr (broadcast GRE).
	 */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: wildcard tunnels; only the key must match exactly. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Last resort: the namespace's fallback device, if it is up. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
360
361 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
362                 struct ip_tunnel_parm *parms)
363 {
364         __be32 remote = parms->iph.daddr;
365         __be32 local = parms->iph.saddr;
366         __be32 key = parms->i_key;
367         unsigned int h = HASH(key);
368         int prio = 0;
369
370         if (local)
371                 prio |= 1;
372         if (remote && !ipv4_is_multicast(remote)) {
373                 prio |= 2;
374                 h ^= HASH(remote);
375         }
376
377         return &ign->tunnels[prio][h];
378 }
379
/* Convenience wrapper: bucket for an already-created tunnel. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
385
/* Insert @t at the head of its hash chain.  Caller holds RTNL. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	/* Publish t->next before *tp so concurrent RCU readers never
	 * observe a half-linked entry.
	 */
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
393
/* Splice @t out of its hash chain.  Caller holds RTNL; readers under
 * RCU may still traverse @t until a grace period elapses, so its
 * ->next pointer is left intact.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
408
409 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
410                                            struct ip_tunnel_parm *parms,
411                                            int type)
412 {
413         __be32 remote = parms->iph.daddr;
414         __be32 local = parms->iph.saddr;
415         __be32 key = parms->i_key;
416         int link = parms->link;
417         struct ip_tunnel *t;
418         struct ip_tunnel __rcu **tp;
419         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
420
421         for (tp = __ipgre_bucket(ign, parms);
422              (t = rtnl_dereference(*tp)) != NULL;
423              tp = &t->next)
424                 if (local == t->parms.iph.saddr &&
425                     remote == t->parms.iph.daddr &&
426                     key == t->parms.i_key &&
427                     link == t->parms.link &&
428                     type == t->dev->type)
429                         break;
430
431         return t;
432 }
433
/*
 * Find a tunnel matching @parms; if none exists and @create is set,
 * allocate and register a new gre device.  Caller holds RTNL.
 * Returns NULL on allocation/registration failure.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");	/* kernel substitutes a free index */

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	/* Bind to the underlying device and derive the usable MTU. */
	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Reference dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
478
/* ndo_uninit: unhash the tunnel and drop the reference taken at
 * creation time in ipgre_tunnel_locate().
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
487
488
/*
 * ICMP error handler for outgoing GRE packets: skb->data points at the
 * returned copy of our outer IP header.  We parse the embedded GRE
 * header, find the originating tunnel and record the error (or update
 * PMTU / redirect state).
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	/* p points at the GRE header inside the returned payload. */
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	/* Minimal GRE header is 4 bytes past the inner IP header. */
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;		/* only version 0, no routing hdr */
		if (flags&GRE_KEY) {
			grehlen += 4;	/* key field present */
			if (flags&GRE_CSUM)
				grehlen += 4;	/* checksum precedes key */
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	/* Key is the last 32-bit word of the (extended) GRE header. */
	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	/* Note: saddr/daddr are swapped — the ICMP quotes OUR packet,
	 * so our source is the tunnel's remote from the receive view.
	 * p[1] is the GRE protocol field.
	 */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	/* NBMA or multicast tunnels cannot attribute the error. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	/* With inherited TTL, TTL-exceeded is expected (traceroute). */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	/* Rate-bucket the error: count within IPTUNNEL_ERR_TIMEO. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}
589
590 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
591 {
592         if (INET_ECN_is_ce(iph->tos)) {
593                 if (skb->protocol == htons(ETH_P_IP)) {
594                         IP_ECN_set_ce(ip_hdr(skb));
595                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
596                         IP6_ECN_set_ce(ipv6_hdr(skb));
597                 }
598         }
599 }
600
601 static inline u8
602 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
603 {
604         u8 inner = 0;
605         if (skb->protocol == htons(ETH_P_IP))
606                 inner = old_iph->tos;
607         else if (skb->protocol == htons(ETH_P_IPV6))
608                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
609         return INET_ECN_encapsulate(tos, inner);
610 }
611
/*
 * Receive path for IPPROTO_GRE packets.  Parses the GRE header
 * (optional checksum, key and sequence fields), finds the owning
 * tunnel, validates checksum/sequencing, strips the header and
 * re-injects the inner packet via netif_rx().  Always consumes skb.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	/* Bytes of GRE header to strip: base 4, plus 4 per option. */
	int    offset = 4;
	__be16 gre_proto;

	/* 16 = maximal GRE header (flags+proto+csum+key+seq). */
	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;	/* hardware verified it */
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	/* Inner protocol, network byte order, right after the flags. */
	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4 high nibble = bare IPv4, else WCCPv2 header. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Bad checksum, or checksum required but absent. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Strictly increasing sequence numbers when configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Re-fetch after the pull invalidated pointers. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		return 0;
	}
	/* No tunnel wants it: tell the sender the "port" is unreachable. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
747
748 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
749 {
750         struct ip_tunnel *tunnel = netdev_priv(dev);
751         struct pcpu_tstats *tstats;
752         const struct iphdr  *old_iph = ip_hdr(skb);
753         const struct iphdr  *tiph;
754         struct flowi4 fl4;
755         u8     tos;
756         __be16 df;
757         struct rtable *rt;                      /* Route to the other host */
758         struct net_device *tdev;                /* Device to other host */
759         struct iphdr  *iph;                     /* Our new IP header */
760         unsigned int max_headroom;              /* The extra header space needed */
761         int    gre_hlen;
762         __be32 dst;
763         int    mtu;
764
765         if (skb->ip_summed == CHECKSUM_PARTIAL &&
766             skb_checksum_help(skb))
767                 goto tx_error;
768
769         if (dev->type == ARPHRD_ETHER)
770                 IPCB(skb)->flags = 0;
771
772         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
773                 gre_hlen = 0;
774                 tiph = (const struct iphdr *)skb->data;
775         } else {
776                 gre_hlen = tunnel->hlen;
777                 tiph = &tunnel->parms.iph;
778         }
779
780         if ((dst = tiph->daddr) == 0) {
781                 /* NBMA tunnel */
782
783                 if (skb_dst(skb) == NULL) {
784                         dev->stats.tx_fifo_errors++;
785                         goto tx_error;
786                 }
787
788                 if (skb->protocol == htons(ETH_P_IP)) {
789                         rt = skb_rtable(skb);
790                         dst = rt_nexthop(rt, old_iph->daddr);
791                 }
792 #if IS_ENABLED(CONFIG_IPV6)
793                 else if (skb->protocol == htons(ETH_P_IPV6)) {
794                         const struct in6_addr *addr6;
795                         struct neighbour *neigh;
796                         bool do_tx_error_icmp;
797                         int addr_type;
798
799                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
800                         if (neigh == NULL)
801                                 goto tx_error;
802
803                         addr6 = (const struct in6_addr *)&neigh->primary_key;
804                         addr_type = ipv6_addr_type(addr6);
805
806                         if (addr_type == IPV6_ADDR_ANY) {
807                                 addr6 = &ipv6_hdr(skb)->daddr;
808                                 addr_type = ipv6_addr_type(addr6);
809                         }
810
811                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
812                                 do_tx_error_icmp = true;
813                         else {
814                                 do_tx_error_icmp = false;
815                                 dst = addr6->s6_addr32[3];
816                         }
817                         neigh_release(neigh);
818                         if (do_tx_error_icmp)
819                                 goto tx_error_icmp;
820                 }
821 #endif
822                 else
823                         goto tx_error;
824         }
825
826         tos = tiph->tos;
827         if (tos == 1) {
828                 tos = 0;
829                 if (skb->protocol == htons(ETH_P_IP))
830                         tos = old_iph->tos;
831                 else if (skb->protocol == htons(ETH_P_IPV6))
832                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
833         }
834
835         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
836                                  tunnel->parms.o_key, RT_TOS(tos),
837                                  tunnel->parms.link);
838         if (IS_ERR(rt)) {
839                 dev->stats.tx_carrier_errors++;
840                 goto tx_error;
841         }
842         tdev = rt->dst.dev;
843
844         if (tdev == dev) {
845                 ip_rt_put(rt);
846                 dev->stats.collisions++;
847                 goto tx_error;
848         }
849
850         df = tiph->frag_off;
851         if (df)
852                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
853         else
854                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
855
856         if (skb_dst(skb))
857                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
858
859         if (skb->protocol == htons(ETH_P_IP)) {
860                 df |= (old_iph->frag_off&htons(IP_DF));
861
862                 if ((old_iph->frag_off&htons(IP_DF)) &&
863                     mtu < ntohs(old_iph->tot_len)) {
864                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
865                         ip_rt_put(rt);
866                         goto tx_error;
867                 }
868         }
869 #if IS_ENABLED(CONFIG_IPV6)
870         else if (skb->protocol == htons(ETH_P_IPV6)) {
871                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
872
873                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
874                         if ((tunnel->parms.iph.daddr &&
875                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
876                             rt6->rt6i_dst.plen == 128) {
877                                 rt6->rt6i_flags |= RTF_MODIFIED;
878                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
879                         }
880                 }
881
882                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
883                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
884                         ip_rt_put(rt);
885                         goto tx_error;
886                 }
887         }
888 #endif
889
890         if (tunnel->err_count > 0) {
891                 if (time_before(jiffies,
892                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
893                         tunnel->err_count--;
894
895                         dst_link_failure(skb);
896                 } else
897                         tunnel->err_count = 0;
898         }
899
900         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
901
902         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
903             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
904                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
905                 if (max_headroom > dev->needed_headroom)
906                         dev->needed_headroom = max_headroom;
907                 if (!new_skb) {
908                         ip_rt_put(rt);
909                         dev->stats.tx_dropped++;
910                         dev_kfree_skb(skb);
911                         return NETDEV_TX_OK;
912                 }
913                 if (skb->sk)
914                         skb_set_owner_w(new_skb, skb->sk);
915                 dev_kfree_skb(skb);
916                 skb = new_skb;
917                 old_iph = ip_hdr(skb);
918         }
919
920         skb_reset_transport_header(skb);
921         skb_push(skb, gre_hlen);
922         skb_reset_network_header(skb);
923         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
924         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
925                               IPSKB_REROUTED);
926         skb_dst_drop(skb);
927         skb_dst_set(skb, &rt->dst);
928
929         /*
930          *      Push down and install the IPIP header.
931          */
932
933         iph                     =       ip_hdr(skb);
934         iph->version            =       4;
935         iph->ihl                =       sizeof(struct iphdr) >> 2;
936         iph->frag_off           =       df;
937         iph->protocol           =       IPPROTO_GRE;
938         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
939         iph->daddr              =       fl4.daddr;
940         iph->saddr              =       fl4.saddr;
941
942         if ((iph->ttl = tiph->ttl) == 0) {
943                 if (skb->protocol == htons(ETH_P_IP))
944                         iph->ttl = old_iph->ttl;
945 #if IS_ENABLED(CONFIG_IPV6)
946                 else if (skb->protocol == htons(ETH_P_IPV6))
947                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
948 #endif
949                 else
950                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
951         }
952
953         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
954         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
955                                    htons(ETH_P_TEB) : skb->protocol;
956
957         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
958                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
959
960                 if (tunnel->parms.o_flags&GRE_SEQ) {
961                         ++tunnel->o_seqno;
962                         *ptr = htonl(tunnel->o_seqno);
963                         ptr--;
964                 }
965                 if (tunnel->parms.o_flags&GRE_KEY) {
966                         *ptr = tunnel->parms.o_key;
967                         ptr--;
968                 }
969                 if (tunnel->parms.o_flags&GRE_CSUM) {
970                         *ptr = 0;
971                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
972                 }
973         }
974
975         nf_reset(skb);
976         tstats = this_cpu_ptr(dev->tstats);
977         __IPTUNNEL_XMIT(tstats, &dev->stats);
978         return NETDEV_TX_OK;
979
980 #if IS_ENABLED(CONFIG_IPV6)
981 tx_error_icmp:
982         dst_link_failure(skb);
983 #endif
984 tx_error:
985         dev->stats.tx_errors++;
986         dev_kfree_skb(skb);
987         return NETDEV_TX_OK;
988 }
989
/*
 * ipgre_tunnel_bind_dev - guess the underlying output device and derive
 * the tunnel device's MTU and headroom requirements from it.
 *
 * Side effects: updates dev->needed_headroom, dev->iflink, dev->flags
 * (IFF_POINTOPOINT for non-Ethernet tunnels with a remote) and
 * tunnel->hlen (outer IP header + GRE header incl. options).
 *
 * Returns the recommended MTU for the tunnel device, never below 68
 * (the minimum IPv4 MTU, RFC 791).
 *
 * NOTE(review): __dev_get_by_index() requires RTNL; this appears to be
 * called only from RTNL-protected paths (ioctl/newlink) -- confirm.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;	/* pessimistic default headroom */
	int mtu = ETH_DATA_LEN;		/* default when no route/link is known */
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + minimal GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
1050
/*
 * ipgre_tunnel_ioctl - legacy "ip tunnel" configuration interface.
 *
 * Handles SIOCGETTUNNEL (query parameters), SIOCADDTUNNEL/SIOCCHGTUNNEL
 * (create or reconfigure a tunnel) and SIOCDELTUNNEL (destroy).
 * Parameters are exchanged with userspace as struct ip_tunnel_parm via
 * ifr->ifr_ifru.ifru_data.  Returns 0 or a negative errno.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel named by the
		 * user-supplied parameters; otherwise report this device. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the outer header template: plain IPv4/GRE,
		 * no IP options, only the DF bit, no GRE version/routing. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only when the KEY flag is set. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parameters already belong to another
				 * tunnel device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The device's broadcast/p2p nature cannot
				 * change after creation. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-key under new endpoints: unlink from
				 * the hash, wait for readers, relink. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to userspace. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself cannot be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1181
1182 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1183 {
1184         struct ip_tunnel *tunnel = netdev_priv(dev);
1185         if (new_mtu < 68 ||
1186             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1187                 return -EINVAL;
1188         dev->mtu = new_mtu;
1189         return 0;
1190 }
1191
1192 /* Nice toy. Unfortunately, useless in real life :-)
1193    It allows to construct virtual multiprotocol broadcast "LAN"
1194    over the Internet, provided multicast routing is tuned.
1195
1196
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1199    I have an impression, that Cisco could make something similar,
1200    but this feature is apparently missing in IOS<=11.2(8).
1201
1202    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1203    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1204
1205    ping -t 255 224.66.66.66
1206
1207    If nobody answers, mbone does not work.
1208
1209    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1210    ip addr add 10.66.66.<somewhat>/24 dev Universe
1211    ifconfig Universe up
1212    ifconfig Universe add fe80::<Your_real_addr>/10
1213    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1214    ftp 10.66.66.66
1215    ...
1216    ftp fec0:6666:6666::193.233.7.65
1217    ...
1218
1219  */
1220
1221 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1222                         unsigned short type,
1223                         const void *daddr, const void *saddr, unsigned int len)
1224 {
1225         struct ip_tunnel *t = netdev_priv(dev);
1226         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1227         __be16 *p = (__be16 *)(iph+1);
1228
1229         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1230         p[0]            = t->parms.o_flags;
1231         p[1]            = htons(type);
1232
1233         /*
1234          *      Set the source hardware address.
1235          */
1236
1237         if (saddr)
1238                 memcpy(&iph->saddr, saddr, 4);
1239         if (daddr)
1240                 memcpy(&iph->daddr, daddr, 4);
1241         if (iph->daddr)
1242                 return t->hlen;
1243
1244         return -t->hlen;
1245 }
1246
1247 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1248 {
1249         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1250         memcpy(haddr, &iph->saddr, 4);
1251         return 4;
1252 }
1253
/* Link-layer header operations for broadcast/NBMA GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1258
1259 #ifdef CONFIG_NET_IPGRE_BROADCAST
1260 static int ipgre_open(struct net_device *dev)
1261 {
1262         struct ip_tunnel *t = netdev_priv(dev);
1263
1264         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1265                 struct flowi4 fl4;
1266                 struct rtable *rt;
1267
1268                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1269                                          t->parms.iph.daddr,
1270                                          t->parms.iph.saddr,
1271                                          t->parms.o_key,
1272                                          RT_TOS(t->parms.iph.tos),
1273                                          t->parms.link);
1274                 if (IS_ERR(rt))
1275                         return -EADDRNOTAVAIL;
1276                 dev = rt->dst.dev;
1277                 ip_rt_put(rt);
1278                 if (__in_dev_get_rtnl(dev) == NULL)
1279                         return -EADDRNOTAVAIL;
1280                 t->mlink = dev->ifindex;
1281                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1282         }
1283         return 0;
1284 }
1285
1286 static int ipgre_close(struct net_device *dev)
1287 {
1288         struct ip_tunnel *t = netdev_priv(dev);
1289
1290         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1291                 struct in_device *in_dev;
1292                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1293                 if (in_dev)
1294                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1295         }
1296         return 0;
1297 }
1298
1299 #endif
1300
/* Device callbacks for plain (layer-3) GRE tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init               = ipgre_tunnel_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open               = ipgre_open,
	.ndo_stop               = ipgre_close,
#endif
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_do_ioctl           = ipgre_tunnel_ioctl,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
	.ndo_get_stats64        = ipgre_get_stats64,
};
1313
/* Device destructor: release per-cpu stats, then the netdev itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1319
/* Offload features GRE tunnel devices can inherit/advertise. */
#define GRE_FEATURES (NETIF_F_SG |              \
		      NETIF_F_FRAGLIST |        \
		      NETIF_F_HIGHDMA |         \
		      NETIF_F_HW_CSUM)
1324
1325 static void ipgre_tunnel_setup(struct net_device *dev)
1326 {
1327         dev->netdev_ops         = &ipgre_netdev_ops;
1328         dev->destructor         = ipgre_dev_free;
1329
1330         dev->type               = ARPHRD_IPGRE;
1331         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1332         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1333         dev->flags              = IFF_NOARP;
1334         dev->iflink             = 0;
1335         dev->addr_len           = 4;
1336         dev->features           |= NETIF_F_NETNS_LOCAL;
1337         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1338
1339         dev->features           |= GRE_FEATURES;
1340         dev->hw_features        |= GRE_FEATURES;
1341 }
1342
/*
 * ndo_init for a layer-3 GRE tunnel device.  Copies the tunnel
 * endpoints into the device hardware/broadcast addresses, selects
 * header_ops for NBMA/broadcast operation and allocates per-cpu stats.
 * Returns 0 or -EINVAL/-ENOMEM.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* The tunnel endpoints double as the 4-byte hw addresses. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Multicast destination requires a bound source. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed destination: NBMA mode, build headers per-skb. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1375
1376 static void ipgre_fb_tunnel_init(struct net_device *dev)
1377 {
1378         struct ip_tunnel *tunnel = netdev_priv(dev);
1379         struct iphdr *iph = &tunnel->parms.iph;
1380
1381         tunnel->dev = dev;
1382         strcpy(tunnel->parms.name, dev->name);
1383
1384         iph->version            = 4;
1385         iph->protocol           = IPPROTO_GRE;
1386         iph->ihl                = 5;
1387         tunnel->hlen            = sizeof(struct iphdr) + 4;
1388
1389         dev_hold(dev);
1390 }
1391
1392
/* GRE demux registration: receive and ICMP error handlers. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1397
1398 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1399 {
1400         int prio;
1401
1402         for (prio = 0; prio < 4; prio++) {
1403                 int h;
1404                 for (h = 0; h < HASH_SIZE; h++) {
1405                         struct ip_tunnel *t;
1406
1407                         t = rtnl_dereference(ign->tunnels[prio][h]);
1408
1409                         while (t != NULL) {
1410                                 unregister_netdevice_queue(t->dev, head);
1411                                 t = rtnl_dereference(t->next);
1412                         }
1413                 }
1414         }
1415 }
1416
/*
 * Per-namespace init: create and register the fallback "gre0" device
 * and hang it on the wildcard (keyless, addressless) hash chain.
 * Returns 0 or a negative errno; on failure everything allocated here
 * is released.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Publish the fallback tunnel on the wildcard chain. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* Not yet registered, so free directly (stats + netdev). */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1445
1446 static void __net_exit ipgre_exit_net(struct net *net)
1447 {
1448         struct ipgre_net *ign;
1449         LIST_HEAD(list);
1450
1451         ign = net_generic(net, ipgre_net_id);
1452         rtnl_lock();
1453         ipgre_destroy_tunnels(ign, &list);
1454         unregister_netdevice_many(&list);
1455         rtnl_unlock();
1456 }
1457
/* Per-network-namespace lifecycle hooks and private state size. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1464
1465 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1466 {
1467         __be16 flags;
1468
1469         if (!data)
1470                 return 0;
1471
1472         flags = 0;
1473         if (data[IFLA_GRE_IFLAGS])
1474                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1475         if (data[IFLA_GRE_OFLAGS])
1476                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1477         if (flags & (GRE_VERSION|GRE_ROUTING))
1478                 return -EINVAL;
1479
1480         return 0;
1481 }
1482
1483 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1484 {
1485         __be32 daddr;
1486
1487         if (tb[IFLA_ADDRESS]) {
1488                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1489                         return -EINVAL;
1490                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1491                         return -EADDRNOTAVAIL;
1492         }
1493
1494         if (!data)
1495                 goto out;
1496
1497         if (data[IFLA_GRE_REMOTE]) {
1498                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1499                 if (!daddr)
1500                         return -EINVAL;
1501         }
1502
1503 out:
1504         return ipgre_tunnel_validate(tb, data);
1505 }
1506
/*
 * Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm.
 * Missing attributes leave zeroed defaults, except PMTU discovery,
 * which defaults to on (DF set) unless explicitly disabled.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery on by default: set DF on the outer header. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1547
1548 static int ipgre_tap_init(struct net_device *dev)
1549 {
1550         struct ip_tunnel *tunnel;
1551
1552         tunnel = netdev_priv(dev);
1553
1554         tunnel->dev = dev;
1555         strcpy(tunnel->parms.name, dev->name);
1556
1557         ipgre_tunnel_bind_dev(dev);
1558
1559         dev->tstats = alloc_percpu(struct pcpu_tstats);
1560         if (!dev->tstats)
1561                 return -ENOMEM;
1562
1563         return 0;
1564 }
1565
/* Device callbacks for gretap (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init               = ipgre_tap_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_set_mac_address    = eth_mac_addr,
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
	.ndo_get_stats64        = ipgre_get_stats64,
};
1575
1576 static void ipgre_tap_setup(struct net_device *dev)
1577 {
1578
1579         ether_setup(dev);
1580
1581         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1582         dev->destructor         = ipgre_dev_free;
1583
1584         dev->iflink             = 0;
1585         dev->features           |= NETIF_F_NETNS_LOCAL;
1586 }
1587
/*
 * rtnl_link_ops->newlink: create a GRE/gretap device from netlink
 * attributes.  Fails with -EEXIST if a tunnel with the same parameters
 * already exists.  Called under RTNL.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	/* Derive MTU from the underlying device unless the user set one. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Reference held by the hash table, dropped in uninit. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1624
/*
 * rtnl_link_ops->changelink: reconfigure an existing GRE/gretap device
 * from netlink attributes.  The fallback device cannot be changed, and
 * new parameters may not collide with another tunnel.  Called under
 * RTNL.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* Parameters already in use by a different device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Broadcast/p2p nature cannot change after create. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-key: move the tunnel to its new hash chain. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* Parameters that do not affect hash placement. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1688
1689 static size_t ipgre_get_size(const struct net_device *dev)
1690 {
1691         return
1692                 /* IFLA_GRE_LINK */
1693                 nla_total_size(4) +
1694                 /* IFLA_GRE_IFLAGS */
1695                 nla_total_size(2) +
1696                 /* IFLA_GRE_OFLAGS */
1697                 nla_total_size(2) +
1698                 /* IFLA_GRE_IKEY */
1699                 nla_total_size(4) +
1700                 /* IFLA_GRE_OKEY */
1701                 nla_total_size(4) +
1702                 /* IFLA_GRE_LOCAL */
1703                 nla_total_size(4) +
1704                 /* IFLA_GRE_REMOTE */
1705                 nla_total_size(4) +
1706                 /* IFLA_GRE_TTL */
1707                 nla_total_size(1) +
1708                 /* IFLA_GRE_TOS */
1709                 nla_total_size(1) +
1710                 /* IFLA_GRE_PMTUDISC */
1711                 nla_total_size(1) +
1712                 0;
1713 }
1714
1715 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1716 {
1717         struct ip_tunnel *t = netdev_priv(dev);
1718         struct ip_tunnel_parm *p = &t->parms;
1719
1720         if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1721             nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1722             nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1723             nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1724             nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1725             nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1726             nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1727             nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1728             nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1729             nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1730                        !!(p->iph.frag_off & htons(IP_DF))))
1731                 goto nla_put_failure;
1732         return 0;
1733
1734 nla_put_failure:
1735         return -EMSGSIZE;
1736 }
1737
/*
 * Netlink attribute validation policy for "gre"/"gretap" links.
 * LOCAL/REMOTE are validated by length only (raw IPv4 addresses in
 * network byte order), all other attributes by integer type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1750
/* rtnl_link_ops for plain layer-3 GRE tunnels ("ip link add type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1763
/*
 * rtnl_link_ops for Ethernet-over-GRE ("gretap") devices.  Shares the
 * policy, changelink and dump callbacks with plain GRE; only setup and
 * validation differ (Ethernet framing).
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1776
1777 /*
1778  *      And now the modules code and kernel interface.
1779  */
1780
1781 static int __init ipgre_init(void)
1782 {
1783         int err;
1784
1785         pr_info("GRE over IPv4 tunneling driver\n");
1786
1787         err = register_pernet_device(&ipgre_net_ops);
1788         if (err < 0)
1789                 return err;
1790
1791         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1792         if (err < 0) {
1793                 pr_info("%s: can't add protocol\n", __func__);
1794                 goto add_proto_failed;
1795         }
1796
1797         err = rtnl_link_register(&ipgre_link_ops);
1798         if (err < 0)
1799                 goto rtnl_link_failed;
1800
1801         err = rtnl_link_register(&ipgre_tap_ops);
1802         if (err < 0)
1803                 goto tap_ops_failed;
1804
1805 out:
1806         return err;
1807
1808 tap_ops_failed:
1809         rtnl_link_unregister(&ipgre_link_ops);
1810 rtnl_link_failed:
1811         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1812 add_proto_failed:
1813         unregister_pernet_device(&ipgre_net_ops);
1814         goto out;
1815 }
1816
1817 static void __exit ipgre_fini(void)
1818 {
1819         rtnl_link_unregister(&ipgre_tap_ops);
1820         rtnl_link_unregister(&ipgre_link_ops);
1821         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1822                 pr_info("%s: can't remove protocol\n", __func__);
1823         unregister_pernet_device(&ipgre_net_ops);
1824 }
1825
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Autoload this module when "gre"/"gretap" rtnl link types are requested. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
/* Autoload when the fallback device "gre0" is opened. */
MODULE_ALIAS_NETDEV("gre0");