/* net/ipv4/ip_gre.c — snapshot from can-eth-gw-linux.git (rtime.felk.cvut.cz),
 * at commit "gre: add GSO support". */
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
96    taking into account fragmentation. TO be short, ttl is not solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
128 /* Fallback tunnel: no source, no destination, no key, no options */
129
130 #define HASH_SIZE  16
131
132 static int ipgre_net_id __read_mostly;
133 struct ipgre_net {
134         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
135
136         struct net_device *fb_tunnel_dev;
137 };
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
153    All keysless packets, if not matched configured keyless tunnels
154    will match fallback tunnel.
155  */
156
/* Fold a 32-bit (big-endian) address/key into a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr ^ ((__force u32)addr >> 4)) & 0xF)

/* Aliases into ipgre_net->tunnels[] by match specificity. */
#define tunnels_r_l	tunnels[3]	/* (remote, local) */
#define tunnels_r	tunnels[2]	/* (remote, *)     */
#define tunnels_l	tunnels[1]	/* (*, local)      */
#define tunnels_wc	tunnels[0]	/* (*, *)          */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walks a bucket under RCU; expects a local 'struct ip_tunnel *t'. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
170 /* often modified stats are per cpu, other are shared (netdev->stats) */
171 struct pcpu_tstats {
172         u64     rx_packets;
173         u64     rx_bytes;
174         u64     tx_packets;
175         u64     tx_bytes;
176         struct u64_stats_sync   syncp;
177 };
178
179 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180                                                    struct rtnl_link_stats64 *tot)
181 {
182         int i;
183
184         for_each_possible_cpu(i) {
185                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
186                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
187                 unsigned int start;
188
189                 do {
190                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
191                         rx_packets = tstats->rx_packets;
192                         tx_packets = tstats->tx_packets;
193                         rx_bytes = tstats->rx_bytes;
194                         tx_bytes = tstats->tx_bytes;
195                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
196
197                 tot->rx_packets += rx_packets;
198                 tot->tx_packets += tx_packets;
199                 tot->rx_bytes   += rx_bytes;
200                 tot->tx_bytes   += tx_bytes;
201         }
202
203         tot->multicast = dev->stats.multicast;
204         tot->rx_crc_errors = dev->stats.rx_crc_errors;
205         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206         tot->rx_length_errors = dev->stats.rx_length_errors;
207         tot->rx_errors = dev->stats.rx_errors;
208         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210         tot->tx_dropped = dev->stats.tx_dropped;
211         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212         tot->tx_errors = dev->stats.tx_errors;
213
214         return tot;
215 }
216
217 /* Given src, dst and key, find appropriate for input tunnel. */
218
219 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220                                              __be32 remote, __be32 local,
221                                              __be32 key, __be16 gre_proto)
222 {
223         struct net *net = dev_net(dev);
224         int link = dev->ifindex;
225         unsigned int h0 = HASH(remote);
226         unsigned int h1 = HASH(key);
227         struct ip_tunnel *t, *cand = NULL;
228         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
229         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
230                        ARPHRD_ETHER : ARPHRD_IPGRE;
231         int score, cand_score = 4;
232
233         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234                 if (local != t->parms.iph.saddr ||
235                     remote != t->parms.iph.daddr ||
236                     key != t->parms.i_key ||
237                     !(t->dev->flags & IFF_UP))
238                         continue;
239
240                 if (t->dev->type != ARPHRD_IPGRE &&
241                     t->dev->type != dev_type)
242                         continue;
243
244                 score = 0;
245                 if (t->parms.link != link)
246                         score |= 1;
247                 if (t->dev->type != dev_type)
248                         score |= 2;
249                 if (score == 0)
250                         return t;
251
252                 if (score < cand_score) {
253                         cand = t;
254                         cand_score = score;
255                 }
256         }
257
258         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259                 if (remote != t->parms.iph.daddr ||
260                     key != t->parms.i_key ||
261                     !(t->dev->flags & IFF_UP))
262                         continue;
263
264                 if (t->dev->type != ARPHRD_IPGRE &&
265                     t->dev->type != dev_type)
266                         continue;
267
268                 score = 0;
269                 if (t->parms.link != link)
270                         score |= 1;
271                 if (t->dev->type != dev_type)
272                         score |= 2;
273                 if (score == 0)
274                         return t;
275
276                 if (score < cand_score) {
277                         cand = t;
278                         cand_score = score;
279                 }
280         }
281
282         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
283                 if ((local != t->parms.iph.saddr &&
284                      (local != t->parms.iph.daddr ||
285                       !ipv4_is_multicast(local))) ||
286                     key != t->parms.i_key ||
287                     !(t->dev->flags & IFF_UP))
288                         continue;
289
290                 if (t->dev->type != ARPHRD_IPGRE &&
291                     t->dev->type != dev_type)
292                         continue;
293
294                 score = 0;
295                 if (t->parms.link != link)
296                         score |= 1;
297                 if (t->dev->type != dev_type)
298                         score |= 2;
299                 if (score == 0)
300                         return t;
301
302                 if (score < cand_score) {
303                         cand = t;
304                         cand_score = score;
305                 }
306         }
307
308         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
309                 if (t->parms.i_key != key ||
310                     !(t->dev->flags & IFF_UP))
311                         continue;
312
313                 if (t->dev->type != ARPHRD_IPGRE &&
314                     t->dev->type != dev_type)
315                         continue;
316
317                 score = 0;
318                 if (t->parms.link != link)
319                         score |= 1;
320                 if (t->dev->type != dev_type)
321                         score |= 2;
322                 if (score == 0)
323                         return t;
324
325                 if (score < cand_score) {
326                         cand = t;
327                         cand_score = score;
328                 }
329         }
330
331         if (cand != NULL)
332                 return cand;
333
334         dev = ign->fb_tunnel_dev;
335         if (dev->flags & IFF_UP)
336                 return netdev_priv(dev);
337
338         return NULL;
339 }
340
341 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
342                 struct ip_tunnel_parm *parms)
343 {
344         __be32 remote = parms->iph.daddr;
345         __be32 local = parms->iph.saddr;
346         __be32 key = parms->i_key;
347         unsigned int h = HASH(key);
348         int prio = 0;
349
350         if (local)
351                 prio |= 1;
352         if (remote && !ipv4_is_multicast(remote)) {
353                 prio |= 2;
354                 h ^= HASH(remote);
355         }
356
357         return &ign->tunnels[prio][h];
358 }
359
360 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
361                 struct ip_tunnel *t)
362 {
363         return __ipgre_bucket(ign, &t->parms);
364 }
365
366 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
367 {
368         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
369
370         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
371         rcu_assign_pointer(*tp, t);
372 }
373
374 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
375 {
376         struct ip_tunnel __rcu **tp;
377         struct ip_tunnel *iter;
378
379         for (tp = ipgre_bucket(ign, t);
380              (iter = rtnl_dereference(*tp)) != NULL;
381              tp = &iter->next) {
382                 if (t == iter) {
383                         rcu_assign_pointer(*tp, t->next);
384                         break;
385                 }
386         }
387 }
388
389 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
390                                            struct ip_tunnel_parm *parms,
391                                            int type)
392 {
393         __be32 remote = parms->iph.daddr;
394         __be32 local = parms->iph.saddr;
395         __be32 key = parms->i_key;
396         int link = parms->link;
397         struct ip_tunnel *t;
398         struct ip_tunnel __rcu **tp;
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         for (tp = __ipgre_bucket(ign, parms);
402              (t = rtnl_dereference(*tp)) != NULL;
403              tp = &t->next)
404                 if (local == t->parms.iph.saddr &&
405                     remote == t->parms.iph.daddr &&
406                     key == t->parms.i_key &&
407                     link == t->parms.link &&
408                     type == t->dev->type)
409                         break;
410
411         return t;
412 }
413
414 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
415                 struct ip_tunnel_parm *parms, int create)
416 {
417         struct ip_tunnel *t, *nt;
418         struct net_device *dev;
419         char name[IFNAMSIZ];
420         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
421
422         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
423         if (t || !create)
424                 return t;
425
426         if (parms->name[0])
427                 strlcpy(name, parms->name, IFNAMSIZ);
428         else
429                 strcpy(name, "gre%d");
430
431         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
432         if (!dev)
433                 return NULL;
434
435         dev_net_set(dev, net);
436
437         nt = netdev_priv(dev);
438         nt->parms = *parms;
439         dev->rtnl_link_ops = &ipgre_link_ops;
440
441         dev->mtu = ipgre_tunnel_bind_dev(dev);
442
443         if (register_netdevice(dev) < 0)
444                 goto failed_free;
445
446         /* Can use a lockless transmit, unless we generate output sequences */
447         if (!(nt->parms.o_flags & GRE_SEQ))
448                 dev->features |= NETIF_F_LLTX;
449
450         dev_hold(dev);
451         ipgre_tunnel_link(ign, nt);
452         return nt;
453
454 failed_free:
455         free_netdev(dev);
456         return NULL;
457 }
458
459 static void ipgre_tunnel_uninit(struct net_device *dev)
460 {
461         struct net *net = dev_net(dev);
462         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
463
464         ipgre_tunnel_unlink(ign, netdev_priv(dev));
465         dev_put(dev);
466 }
467
468
469 static void ipgre_err(struct sk_buff *skb, u32 info)
470 {
471
472 /* All the routers (except for Linux) return only
473    8 bytes of packet payload. It means, that precise relaying of
474    ICMP in the real Internet is absolutely infeasible.
475
476    Moreover, Cisco "wise men" put GRE key to the third word
477    in GRE header. It makes impossible maintaining even soft state for keyed
478    GRE tunnels with enabled checksum. Tell them "thank you".
479
480    Well, I wonder, rfc1812 was written by Cisco employee,
481    what the hell these idiots break standards established
482    by themselves???
483  */
484
485         const struct iphdr *iph = (const struct iphdr *)skb->data;
486         __be16       *p = (__be16 *)(skb->data+(iph->ihl<<2));
487         int grehlen = (iph->ihl<<2) + 4;
488         const int type = icmp_hdr(skb)->type;
489         const int code = icmp_hdr(skb)->code;
490         struct ip_tunnel *t;
491         __be16 flags;
492
493         flags = p[0];
494         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
495                 if (flags&(GRE_VERSION|GRE_ROUTING))
496                         return;
497                 if (flags&GRE_KEY) {
498                         grehlen += 4;
499                         if (flags&GRE_CSUM)
500                                 grehlen += 4;
501                 }
502         }
503
504         /* If only 8 bytes returned, keyed message will be dropped here */
505         if (skb_headlen(skb) < grehlen)
506                 return;
507
508         switch (type) {
509         default:
510         case ICMP_PARAMETERPROB:
511                 return;
512
513         case ICMP_DEST_UNREACH:
514                 switch (code) {
515                 case ICMP_SR_FAILED:
516                 case ICMP_PORT_UNREACH:
517                         /* Impossible event. */
518                         return;
519                 default:
520                         /* All others are translated to HOST_UNREACH.
521                            rfc2003 contains "deep thoughts" about NET_UNREACH,
522                            I believe they are just ether pollution. --ANK
523                          */
524                         break;
525                 }
526                 break;
527         case ICMP_TIME_EXCEEDED:
528                 if (code != ICMP_EXC_TTL)
529                         return;
530                 break;
531
532         case ICMP_REDIRECT:
533                 break;
534         }
535
536         rcu_read_lock();
537         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
538                                 flags & GRE_KEY ?
539                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
540                                 p[1]);
541         if (t == NULL)
542                 goto out;
543
544         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545                 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546                                  t->parms.link, 0, IPPROTO_GRE, 0);
547                 goto out;
548         }
549         if (type == ICMP_REDIRECT) {
550                 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551                               IPPROTO_GRE, 0);
552                 goto out;
553         }
554         if (t->parms.iph.daddr == 0 ||
555             ipv4_is_multicast(t->parms.iph.daddr))
556                 goto out;
557
558         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
559                 goto out;
560
561         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
562                 t->err_count++;
563         else
564                 t->err_count = 1;
565         t->err_time = jiffies;
566 out:
567         rcu_read_unlock();
568 }
569
570 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
571 {
572         if (INET_ECN_is_ce(iph->tos)) {
573                 if (skb->protocol == htons(ETH_P_IP)) {
574                         IP_ECN_set_ce(ip_hdr(skb));
575                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576                         IP6_ECN_set_ce(ipv6_hdr(skb));
577                 }
578         }
579 }
580
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583 {
584         u8 inner = 0;
585         if (skb->protocol == htons(ETH_P_IP))
586                 inner = old_iph->tos;
587         else if (skb->protocol == htons(ETH_P_IPV6))
588                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589         return INET_ECN_encapsulate(tos, inner);
590 }
591
592 static int ipgre_rcv(struct sk_buff *skb)
593 {
594         const struct iphdr *iph;
595         u8     *h;
596         __be16    flags;
597         __sum16   csum = 0;
598         __be32 key = 0;
599         u32    seqno = 0;
600         struct ip_tunnel *tunnel;
601         int    offset = 4;
602         __be16 gre_proto;
603
604         if (!pskb_may_pull(skb, 16))
605                 goto drop_nolock;
606
607         iph = ip_hdr(skb);
608         h = skb->data;
609         flags = *(__be16 *)h;
610
611         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
612                 /* - Version must be 0.
613                    - We do not support routing headers.
614                  */
615                 if (flags&(GRE_VERSION|GRE_ROUTING))
616                         goto drop_nolock;
617
618                 if (flags&GRE_CSUM) {
619                         switch (skb->ip_summed) {
620                         case CHECKSUM_COMPLETE:
621                                 csum = csum_fold(skb->csum);
622                                 if (!csum)
623                                         break;
624                                 /* fall through */
625                         case CHECKSUM_NONE:
626                                 skb->csum = 0;
627                                 csum = __skb_checksum_complete(skb);
628                                 skb->ip_summed = CHECKSUM_COMPLETE;
629                         }
630                         offset += 4;
631                 }
632                 if (flags&GRE_KEY) {
633                         key = *(__be32 *)(h + offset);
634                         offset += 4;
635                 }
636                 if (flags&GRE_SEQ) {
637                         seqno = ntohl(*(__be32 *)(h + offset));
638                         offset += 4;
639                 }
640         }
641
642         gre_proto = *(__be16 *)(h + 2);
643
644         rcu_read_lock();
645         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
646                                           iph->saddr, iph->daddr, key,
647                                           gre_proto))) {
648                 struct pcpu_tstats *tstats;
649
650                 secpath_reset(skb);
651
652                 skb->protocol = gre_proto;
653                 /* WCCP version 1 and 2 protocol decoding.
654                  * - Change protocol to IP
655                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
656                  */
657                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
658                         skb->protocol = htons(ETH_P_IP);
659                         if ((*(h + offset) & 0xF0) != 0x40)
660                                 offset += 4;
661                 }
662
663                 skb->mac_header = skb->network_header;
664                 __pskb_pull(skb, offset);
665                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
666                 skb->pkt_type = PACKET_HOST;
667 #ifdef CONFIG_NET_IPGRE_BROADCAST
668                 if (ipv4_is_multicast(iph->daddr)) {
669                         /* Looped back packet, drop it! */
670                         if (rt_is_output_route(skb_rtable(skb)))
671                                 goto drop;
672                         tunnel->dev->stats.multicast++;
673                         skb->pkt_type = PACKET_BROADCAST;
674                 }
675 #endif
676
677                 if (((flags&GRE_CSUM) && csum) ||
678                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
679                         tunnel->dev->stats.rx_crc_errors++;
680                         tunnel->dev->stats.rx_errors++;
681                         goto drop;
682                 }
683                 if (tunnel->parms.i_flags&GRE_SEQ) {
684                         if (!(flags&GRE_SEQ) ||
685                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
686                                 tunnel->dev->stats.rx_fifo_errors++;
687                                 tunnel->dev->stats.rx_errors++;
688                                 goto drop;
689                         }
690                         tunnel->i_seqno = seqno + 1;
691                 }
692
693                 /* Warning: All skb pointers will be invalidated! */
694                 if (tunnel->dev->type == ARPHRD_ETHER) {
695                         if (!pskb_may_pull(skb, ETH_HLEN)) {
696                                 tunnel->dev->stats.rx_length_errors++;
697                                 tunnel->dev->stats.rx_errors++;
698                                 goto drop;
699                         }
700
701                         iph = ip_hdr(skb);
702                         skb->protocol = eth_type_trans(skb, tunnel->dev);
703                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
704                 }
705
706                 tstats = this_cpu_ptr(tunnel->dev->tstats);
707                 u64_stats_update_begin(&tstats->syncp);
708                 tstats->rx_packets++;
709                 tstats->rx_bytes += skb->len;
710                 u64_stats_update_end(&tstats->syncp);
711
712                 __skb_tunnel_rx(skb, tunnel->dev);
713
714                 skb_reset_network_header(skb);
715                 ipgre_ecn_decapsulate(iph, skb);
716
717                 netif_rx(skb);
718
719                 rcu_read_unlock();
720                 return 0;
721         }
722         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
723
724 drop:
725         rcu_read_unlock();
726 drop_nolock:
727         kfree_skb(skb);
728         return 0;
729 }
730
731 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
732 {
733         struct ip_tunnel *tunnel = netdev_priv(dev);
734         struct pcpu_tstats *tstats;
735         const struct iphdr  *old_iph = ip_hdr(skb);
736         const struct iphdr  *tiph;
737         struct flowi4 fl4;
738         u8     tos;
739         __be16 df;
740         struct rtable *rt;                      /* Route to the other host */
741         struct net_device *tdev;                /* Device to other host */
742         struct iphdr  *iph;                     /* Our new IP header */
743         unsigned int max_headroom;              /* The extra header space needed */
744         int    gre_hlen;
745         __be32 dst;
746         int    mtu;
747
748         if (skb->ip_summed == CHECKSUM_PARTIAL &&
749             skb_checksum_help(skb))
750                 goto tx_error;
751
752         if (dev->type == ARPHRD_ETHER)
753                 IPCB(skb)->flags = 0;
754
755         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
756                 gre_hlen = 0;
757                 tiph = (const struct iphdr *)skb->data;
758         } else {
759                 gre_hlen = tunnel->hlen;
760                 tiph = &tunnel->parms.iph;
761         }
762
763         if ((dst = tiph->daddr) == 0) {
764                 /* NBMA tunnel */
765
766                 if (skb_dst(skb) == NULL) {
767                         dev->stats.tx_fifo_errors++;
768                         goto tx_error;
769                 }
770
771                 if (skb->protocol == htons(ETH_P_IP)) {
772                         rt = skb_rtable(skb);
773                         dst = rt_nexthop(rt, old_iph->daddr);
774                 }
775 #if IS_ENABLED(CONFIG_IPV6)
776                 else if (skb->protocol == htons(ETH_P_IPV6)) {
777                         const struct in6_addr *addr6;
778                         struct neighbour *neigh;
779                         bool do_tx_error_icmp;
780                         int addr_type;
781
782                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
783                         if (neigh == NULL)
784                                 goto tx_error;
785
786                         addr6 = (const struct in6_addr *)&neigh->primary_key;
787                         addr_type = ipv6_addr_type(addr6);
788
789                         if (addr_type == IPV6_ADDR_ANY) {
790                                 addr6 = &ipv6_hdr(skb)->daddr;
791                                 addr_type = ipv6_addr_type(addr6);
792                         }
793
794                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
795                                 do_tx_error_icmp = true;
796                         else {
797                                 do_tx_error_icmp = false;
798                                 dst = addr6->s6_addr32[3];
799                         }
800                         neigh_release(neigh);
801                         if (do_tx_error_icmp)
802                                 goto tx_error_icmp;
803                 }
804 #endif
805                 else
806                         goto tx_error;
807         }
808
809         tos = tiph->tos;
810         if (tos == 1) {
811                 tos = 0;
812                 if (skb->protocol == htons(ETH_P_IP))
813                         tos = old_iph->tos;
814                 else if (skb->protocol == htons(ETH_P_IPV6))
815                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
816         }
817
818         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
819                                  tunnel->parms.o_key, RT_TOS(tos),
820                                  tunnel->parms.link);
821         if (IS_ERR(rt)) {
822                 dev->stats.tx_carrier_errors++;
823                 goto tx_error;
824         }
825         tdev = rt->dst.dev;
826
827         if (tdev == dev) {
828                 ip_rt_put(rt);
829                 dev->stats.collisions++;
830                 goto tx_error;
831         }
832
833         df = tiph->frag_off;
834         if (df)
835                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
836         else
837                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
838
839         if (skb_dst(skb))
840                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
841
842         if (skb->protocol == htons(ETH_P_IP)) {
843                 df |= (old_iph->frag_off&htons(IP_DF));
844
845                 if ((old_iph->frag_off&htons(IP_DF)) &&
846                     mtu < ntohs(old_iph->tot_len)) {
847                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
848                         ip_rt_put(rt);
849                         goto tx_error;
850                 }
851         }
852 #if IS_ENABLED(CONFIG_IPV6)
853         else if (skb->protocol == htons(ETH_P_IPV6)) {
854                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
855
856                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
857                         if ((tunnel->parms.iph.daddr &&
858                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
859                             rt6->rt6i_dst.plen == 128) {
860                                 rt6->rt6i_flags |= RTF_MODIFIED;
861                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
862                         }
863                 }
864
865                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
866                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
867                         ip_rt_put(rt);
868                         goto tx_error;
869                 }
870         }
871 #endif
872
873         if (tunnel->err_count > 0) {
874                 if (time_before(jiffies,
875                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
876                         tunnel->err_count--;
877
878                         dst_link_failure(skb);
879                 } else
880                         tunnel->err_count = 0;
881         }
882
883         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
884
885         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
886             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
887                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
888                 if (max_headroom > dev->needed_headroom)
889                         dev->needed_headroom = max_headroom;
890                 if (!new_skb) {
891                         ip_rt_put(rt);
892                         dev->stats.tx_dropped++;
893                         dev_kfree_skb(skb);
894                         return NETDEV_TX_OK;
895                 }
896                 if (skb->sk)
897                         skb_set_owner_w(new_skb, skb->sk);
898                 dev_kfree_skb(skb);
899                 skb = new_skb;
900                 old_iph = ip_hdr(skb);
901         }
902
903         skb_reset_transport_header(skb);
904         skb_push(skb, gre_hlen);
905         skb_reset_network_header(skb);
906         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
907         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
908                               IPSKB_REROUTED);
909         skb_dst_drop(skb);
910         skb_dst_set(skb, &rt->dst);
911
912         /*
913          *      Push down and install the IPIP header.
914          */
915
916         iph                     =       ip_hdr(skb);
917         iph->version            =       4;
918         iph->ihl                =       sizeof(struct iphdr) >> 2;
919         iph->frag_off           =       df;
920         iph->protocol           =       IPPROTO_GRE;
921         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
922         iph->daddr              =       fl4.daddr;
923         iph->saddr              =       fl4.saddr;
924
925         if ((iph->ttl = tiph->ttl) == 0) {
926                 if (skb->protocol == htons(ETH_P_IP))
927                         iph->ttl = old_iph->ttl;
928 #if IS_ENABLED(CONFIG_IPV6)
929                 else if (skb->protocol == htons(ETH_P_IPV6))
930                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
931 #endif
932                 else
933                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
934         }
935
936         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
937         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
938                                    htons(ETH_P_TEB) : skb->protocol;
939
940         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
941                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
942
943                 if (tunnel->parms.o_flags&GRE_SEQ) {
944                         ++tunnel->o_seqno;
945                         *ptr = htonl(tunnel->o_seqno);
946                         ptr--;
947                 }
948                 if (tunnel->parms.o_flags&GRE_KEY) {
949                         *ptr = tunnel->parms.o_key;
950                         ptr--;
951                 }
952                 if (tunnel->parms.o_flags&GRE_CSUM) {
953                         *ptr = 0;
954                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
955                 }
956         }
957
958         nf_reset(skb);
959         tstats = this_cpu_ptr(dev->tstats);
960         __IPTUNNEL_XMIT(tstats, &dev->stats);
961         return NETDEV_TX_OK;
962
963 #if IS_ENABLED(CONFIG_IPV6)
964 tx_error_icmp:
965         dst_link_failure(skb);
966 #endif
967 tx_error:
968         dev->stats.tx_errors++;
969         dev_kfree_skb(skb);
970         return NETDEV_TX_OK;
971 }
972
/*
 * Guess the underlying output device for this tunnel and derive a
 * suitable MTU from it.
 *
 * Returns the MTU the tunnel device should use (clamped to >= 68, the
 * RFC 791 minimum IPv4 datagram size).  Side effects: caches the total
 * encapsulation header length in tunnel->hlen and updates
 * dev->needed_headroom, dev->iflink and (for non-Ethernet devices with
 * a fixed peer) IFF_POINTOPOINT.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + basic GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly configured link, if any */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length: 4 bytes per optional field */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
1033
/*
 * Legacy ioctl interface (SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL) for configuring GRE tunnels via an ip_tunnel_parm
 * passed through ifr->ifr_ifru.ifru_data.  When invoked on the
 * per-namespace fallback device ("gre0"), get/delete operate on the
 * tunnel named by the user-supplied parameters rather than on the
 * device itself.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Accept only a plain IPv4 header without options, DF the
		 * sole allowed frag bit, and no unsupported GRE flag bits
		 * (version / routing). */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Ignore key values whose flag bit says they are absent */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already owned by another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* A change may not flip the device between
				 * point-to-point and broadcast mode */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new
				 * addresses/keys; wait out RCU readers of
				 * the old chain position first */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the (possibly normalized) parameters */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself may not be deleted */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1164
1165 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1166 {
1167         struct ip_tunnel *tunnel = netdev_priv(dev);
1168         if (new_mtu < 68 ||
1169             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1170                 return -EINVAL;
1171         dev->mtu = new_mtu;
1172         return 0;
1173 }
1174
1175 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.
1178
1179
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1184
1185    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1186    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1187
1188    ping -t 255 224.66.66.66
1189
1190    If nobody answers, mbone does not work.
1191
1192    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1193    ip addr add 10.66.66.<somewhat>/24 dev Universe
1194    ifconfig Universe up
1195    ifconfig Universe add fe80::<Your_real_addr>/10
1196    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1197    ftp 10.66.66.66
1198    ...
1199    ftp fec0:6666:6666::193.233.7.65
1200    ...
1201
1202  */
1203
/*
 * hard_header "create" hook: prepend the preconfigured tunnel IP
 * header followed by the two 16-bit GRE words (flags, protocol type).
 *
 * Returns the pushed header length when the destination is known (the
 * header is complete), or its negative when the caller must still
 * supply a destination — the partial-header convention of
 * dev_hard_header().
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);	/* GRE words follow the IP header */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}
1229
1230 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1231 {
1232         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1233         memcpy(haddr, &iph->saddr, 4);
1234         return 4;
1235 }
1236
/* Header ops installed for NBMA-mode and broadcast-mode GRE devices
 * (see ipgre_tunnel_init()). */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse	= ipgre_header_parse,
};
1241
1242 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode tunnels (multicast destination): route
 * towards the group, join it on the resolved output device, and store
 * that device's ifindex in t->mlink so ipgre_close() can leave again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		/* From here on "dev" is the underlying output device,
		 * not the tunnel device */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1268
1269 static int ipgre_close(struct net_device *dev)
1270 {
1271         struct ip_tunnel *t = netdev_priv(dev);
1272
1273         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1274                 struct in_device *in_dev;
1275                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1276                 if (in_dev)
1277                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1278         }
1279         return 0;
1280 }
1281
1282 #endif
1283
/* Device operations for plain (layer-3) GRE tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1296
/* netdev destructor: release the per-cpu stats allocated at init time
 * before freeing the device itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1302
/* Offload feature set advertised by GRE devices (both as enabled
 * features and as user-togglable hw_features). */
#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)
1307
/*
 * rtnl setup hook for plain GRE devices.  Defaults assume the minimal
 * encapsulation overhead: outer IP header plus the 4-byte basic GRE
 * header (ipgre_tunnel_bind_dev() refines this once options are known).
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" address is an IPv4 address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
}
1325
/*
 * ndo_init for plain GRE devices: populate the tunnel private data,
 * derive the 4-byte "hardware" addresses from the tunnel endpoints,
 * select header_ops for NBMA or multicast-broadcast operation, and
 * allocate per-cpu stats.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast mode needs a local address to join
			 * the multicast group from (see ipgre_open()) */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed peer (NBMA): destination is supplied
		 * per-packet through header_ops */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1358
/*
 * Initialize the per-namespace fallback device ("gre0").  It has no
 * endpoint addresses configured; only the fixed outer-header fields
 * and the minimal GRE header length are set.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* NOTE(review): extra device reference — presumably dropped when
	 * the namespace unregisters the device; confirm against
	 * ipgre_tunnel_uninit */
	dev_hold(dev);
}
1374
1375
/* Receive/error hooks registered with the shared GRE demultiplexer. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1380
1381 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1382 {
1383         int prio;
1384
1385         for (prio = 0; prio < 4; prio++) {
1386                 int h;
1387                 for (h = 0; h < HASH_SIZE; h++) {
1388                         struct ip_tunnel *t;
1389
1390                         t = rtnl_dereference(ign->tunnels[prio][h]);
1391
1392                         while (t != NULL) {
1393                                 unregister_netdevice_queue(t->dev, head);
1394                                 t = rtnl_dereference(t->next);
1395                         }
1396                 }
1397         }
1398 }
1399
/*
 * Per-namespace init: create and register the fallback device "gre0"
 * and install it as the wildcard entry of the tunnel hash so packets
 * matching no configured tunnel are handled by it.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Wildcard (keyless, addressless) slot points at the fallback */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1428
1429 static void __net_exit ipgre_exit_net(struct net *net)
1430 {
1431         struct ipgre_net *ign;
1432         LIST_HEAD(list);
1433
1434         ign = net_generic(net, ipgre_net_id);
1435         rtnl_lock();
1436         ipgre_destroy_tunnels(ign, &list);
1437         unregister_netdevice_many(&list);
1438         rtnl_unlock();
1439 }
1440
/* Per-network-namespace lifecycle hooks and private-state descriptor. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1447
1448 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1449 {
1450         __be16 flags;
1451
1452         if (!data)
1453                 return 0;
1454
1455         flags = 0;
1456         if (data[IFLA_GRE_IFLAGS])
1457                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1458         if (data[IFLA_GRE_OFLAGS])
1459                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1460         if (flags & (GRE_VERSION|GRE_ROUTING))
1461                 return -EINVAL;
1462
1463         return 0;
1464 }
1465
1466 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1467 {
1468         __be32 daddr;
1469
1470         if (tb[IFLA_ADDRESS]) {
1471                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1472                         return -EINVAL;
1473                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1474                         return -EADDRNOTAVAIL;
1475         }
1476
1477         if (!data)
1478                 goto out;
1479
1480         if (data[IFLA_GRE_REMOTE]) {
1481                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1482                 if (!daddr)
1483                         return -EINVAL;
1484         }
1485
1486 out:
1487         return ipgre_tunnel_validate(tb, data);
1488 }
1489
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes leave the zeroed defaults, except path MTU
 * discovery, which defaults to enabled (DF set) when the attribute
 * is missing.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery on unless explicitly disabled */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1530
1531 static int ipgre_tap_init(struct net_device *dev)
1532 {
1533         struct ip_tunnel *tunnel;
1534
1535         tunnel = netdev_priv(dev);
1536
1537         tunnel->dev = dev;
1538         strcpy(tunnel->parms.name, dev->name);
1539
1540         ipgre_tunnel_bind_dev(dev);
1541
1542         dev->tstats = alloc_percpu(struct pcpu_tstats);
1543         if (!dev->tstats)
1544                 return -ENOMEM;
1545
1546         return 0;
1547 }
1548
/* Device operations for gretap (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1558
/* rtnl setup hook for gretap devices: an Ethernet-like device whose
 * frames are carried inside GRE. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1570
/*
 * rtnl newlink hook: parse the netlink parameters, refuse duplicates,
 * bind to the underlying device to size the MTU, then register the
 * device and insert the tunnel into the hash.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	/* An explicit IFLA_MTU overrides the derived value */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1607
/*
 * rtnl changelink hook: apply new parameters to an existing tunnel.
 * Changing the addresses or input key rehashes the tunnel; the
 * per-namespace fallback device may not be modified.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters already in use by another device */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The change may not flip the device between
			 * point-to-point and broadcast mode */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Rehash under the new addresses/input key */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1671
1672 static size_t ipgre_get_size(const struct net_device *dev)
1673 {
1674         return
1675                 /* IFLA_GRE_LINK */
1676                 nla_total_size(4) +
1677                 /* IFLA_GRE_IFLAGS */
1678                 nla_total_size(2) +
1679                 /* IFLA_GRE_OFLAGS */
1680                 nla_total_size(2) +
1681                 /* IFLA_GRE_IKEY */
1682                 nla_total_size(4) +
1683                 /* IFLA_GRE_OKEY */
1684                 nla_total_size(4) +
1685                 /* IFLA_GRE_LOCAL */
1686                 nla_total_size(4) +
1687                 /* IFLA_GRE_REMOTE */
1688                 nla_total_size(4) +
1689                 /* IFLA_GRE_TTL */
1690                 nla_total_size(1) +
1691                 /* IFLA_GRE_TOS */
1692                 nla_total_size(1) +
1693                 /* IFLA_GRE_PMTUDISC */
1694                 nla_total_size(1) +
1695                 0;
1696 }
1697
1698 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1699 {
1700         struct ip_tunnel *t = netdev_priv(dev);
1701         struct ip_tunnel_parm *p = &t->parms;
1702
1703         if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1704             nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1705             nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1706             nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1707             nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1708             nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1709             nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1710             nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1711             nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1712             nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1713                        !!(p->iph.frag_off & htons(IP_DF))))
1714                 goto nla_put_failure;
1715         return 0;
1716
1717 nla_put_failure:
1718         return -EMSGSIZE;
1719 }
1720
/* Netlink attribute validation policy for "gre"/"gretap" links.
 * LOCAL/REMOTE carry raw IPv4 addresses, so they are length-checked
 * against the iphdr field size rather than typed as integers.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1733
/* rtnetlink ops for plain layer-3 GRE devices ("ip link add ... type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1746
/* rtnetlink ops for Ethernet-over-GRE devices ("type gretap").
 * Shares policy, size and dump helpers with the "gre" kind; only the
 * setup/validate hooks differ (Ethernet framing vs. raw IP).
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1759
1760 /*
1761  *      And now the modules code and kernel interface.
1762  */
1763
1764 static int __init ipgre_init(void)
1765 {
1766         int err;
1767
1768         pr_info("GRE over IPv4 tunneling driver\n");
1769
1770         err = register_pernet_device(&ipgre_net_ops);
1771         if (err < 0)
1772                 return err;
1773
1774         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1775         if (err < 0) {
1776                 pr_info("%s: can't add protocol\n", __func__);
1777                 goto add_proto_failed;
1778         }
1779
1780         err = rtnl_link_register(&ipgre_link_ops);
1781         if (err < 0)
1782                 goto rtnl_link_failed;
1783
1784         err = rtnl_link_register(&ipgre_tap_ops);
1785         if (err < 0)
1786                 goto tap_ops_failed;
1787
1788 out:
1789         return err;
1790
1791 tap_ops_failed:
1792         rtnl_link_unregister(&ipgre_link_ops);
1793 rtnl_link_failed:
1794         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1795 add_proto_failed:
1796         unregister_pernet_device(&ipgre_net_ops);
1797         goto out;
1798 }
1799
/* Module exit: undo ipgre_init() in exact reverse order — link kinds
 * first (stops new device creation), then the GRE protocol hook, then
 * the per-net ops.  This ordering is required; do not reorder.
 */
static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        /* Can only fail if the protocol was never/already removed. */
        if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
                pr_info("%s: can't remove protocol\n", __func__);
        unregister_pernet_device(&ipgre_net_ops);
}
1808
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Let "ip link add ... type gre|gretap" autoload this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
/* Autoload when the fallback gre0 device is requested by name. */
MODULE_ALIAS_NETDEV("gre0");