]> rtime.felk.cvut.cz Git - can-eth-gw-linux.git/blob - net/ipv4/ip_gre.c
0d4c3832d490c7b17ae04a54cf8c4cbae1d02b54
[can-eth-gw-linux.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is an infeasible task. The most general solution would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
96    taking into account fragmentation. To be short, ttl is not a solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
128 /* Fallback tunnel: no source, no destination, no key, no options */
129
130 #define HASH_SIZE  16
131
132 static int ipgre_net_id __read_mostly;
/* Per-namespace GRE state; obtained with net_generic(net, ipgre_net_id). */
struct ipgre_net {
        /* Four hash tables (one per match priority), HASH_SIZE buckets each. */
        struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

        /* Device whose tunnel ipgre_tunnel_lookup() returns when nothing else matches. */
        struct net_device *fb_tunnel_dev;
};
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
153    All keyless packets, if not matched by configured keyless tunnels,
154    will match fallback tunnel.
155  */
156
/*
 * Fold a 32-bit address or key into a bucket index in the range 0..15.
 * The argument is fully parenthesized so that expression arguments
 * (e.g. HASH(a ^ b)) are hashed as a whole.  It is expanded twice, so it
 * must be free of side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
158
159 #define tunnels_r_l     tunnels[3]
160 #define tunnels_r       tunnels[2]
161 #define tunnels_l       tunnels[1]
162 #define tunnels_wc      tunnels[0]
163 /*
164  * Locking : hash tables are protected by RCU and RTNL
165  */
166
/*
 * Walk a tunnel chain starting at @start, following ->next.
 * The cursor is NOT a macro argument: the caller must have a variable
 * named `t` in scope.  Every link is loaded with rcu_dereference().
 */
#define for_each_ip_tunnel_rcu(start) \
        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
170 /* often modified stats are per cpu, other are shared (netdev->stats) */
/* Per-CPU packet/byte counters; summed over all CPUs by ipgre_get_stats64(). */
struct pcpu_tstats {
        u64     rx_packets;
        u64     rx_bytes;
        u64     tx_packets;
        u64     tx_bytes;
        /* Passed to the u64_stats_* begin/retry helpers around counter access. */
        struct u64_stats_sync   syncp;
};
178
179 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180                                                    struct rtnl_link_stats64 *tot)
181 {
182         int i;
183
184         for_each_possible_cpu(i) {
185                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
186                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
187                 unsigned int start;
188
189                 do {
190                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
191                         rx_packets = tstats->rx_packets;
192                         tx_packets = tstats->tx_packets;
193                         rx_bytes = tstats->rx_bytes;
194                         tx_bytes = tstats->tx_bytes;
195                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
196
197                 tot->rx_packets += rx_packets;
198                 tot->tx_packets += tx_packets;
199                 tot->rx_bytes   += rx_bytes;
200                 tot->tx_bytes   += tx_bytes;
201         }
202
203         tot->multicast = dev->stats.multicast;
204         tot->rx_crc_errors = dev->stats.rx_crc_errors;
205         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206         tot->rx_length_errors = dev->stats.rx_length_errors;
207         tot->rx_errors = dev->stats.rx_errors;
208         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210         tot->tx_dropped = dev->stats.tx_dropped;
211         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212         tot->tx_errors = dev->stats.tx_errors;
213
214         return tot;
215 }
216
217 /* Does key in tunnel parameters match packet */
218 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
219                             __u32 flags, __be32 key)
220 {
221         if (p->i_flags & GRE_KEY) {
222                 if (flags & GRE_KEY)
223                         return key == p->i_key;
224                 else
225                         return false;   /* key expected, none present */
226         } else
227                 return !(flags & GRE_KEY);
228 }
229
230 /* Given src, dst and key, find appropriate for input tunnel. */
231
232 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
233                                              __be32 remote, __be32 local,
234                                              __u32 flags, __be32 key,
235                                              __be16 gre_proto)
236 {
237         struct net *net = dev_net(dev);
238         int link = dev->ifindex;
239         unsigned int h0 = HASH(remote);
240         unsigned int h1 = HASH(key);
241         struct ip_tunnel *t, *cand = NULL;
242         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
243         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
244                        ARPHRD_ETHER : ARPHRD_IPGRE;
245         int score, cand_score = 4;
246
247         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
248                 if (local != t->parms.iph.saddr ||
249                     remote != t->parms.iph.daddr ||
250                     !(t->dev->flags & IFF_UP))
251                         continue;
252
253                 if (!ipgre_key_match(&t->parms, flags, key))
254                         continue;
255
256                 if (t->dev->type != ARPHRD_IPGRE &&
257                     t->dev->type != dev_type)
258                         continue;
259
260                 score = 0;
261                 if (t->parms.link != link)
262                         score |= 1;
263                 if (t->dev->type != dev_type)
264                         score |= 2;
265                 if (score == 0)
266                         return t;
267
268                 if (score < cand_score) {
269                         cand = t;
270                         cand_score = score;
271                 }
272         }
273
274         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
275                 if (remote != t->parms.iph.daddr ||
276                     !(t->dev->flags & IFF_UP))
277                         continue;
278
279                 if (!ipgre_key_match(&t->parms, flags, key))
280                         continue;
281
282                 if (t->dev->type != ARPHRD_IPGRE &&
283                     t->dev->type != dev_type)
284                         continue;
285
286                 score = 0;
287                 if (t->parms.link != link)
288                         score |= 1;
289                 if (t->dev->type != dev_type)
290                         score |= 2;
291                 if (score == 0)
292                         return t;
293
294                 if (score < cand_score) {
295                         cand = t;
296                         cand_score = score;
297                 }
298         }
299
300         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
301                 if ((local != t->parms.iph.saddr &&
302                      (local != t->parms.iph.daddr ||
303                       !ipv4_is_multicast(local))) ||
304                     !(t->dev->flags & IFF_UP))
305                         continue;
306
307                 if (!ipgre_key_match(&t->parms, flags, key))
308                         continue;
309
310                 if (t->dev->type != ARPHRD_IPGRE &&
311                     t->dev->type != dev_type)
312                         continue;
313
314                 score = 0;
315                 if (t->parms.link != link)
316                         score |= 1;
317                 if (t->dev->type != dev_type)
318                         score |= 2;
319                 if (score == 0)
320                         return t;
321
322                 if (score < cand_score) {
323                         cand = t;
324                         cand_score = score;
325                 }
326         }
327
328         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
329                 if (t->parms.i_key != key ||
330                     !(t->dev->flags & IFF_UP))
331                         continue;
332
333                 if (t->dev->type != ARPHRD_IPGRE &&
334                     t->dev->type != dev_type)
335                         continue;
336
337                 score = 0;
338                 if (t->parms.link != link)
339                         score |= 1;
340                 if (t->dev->type != dev_type)
341                         score |= 2;
342                 if (score == 0)
343                         return t;
344
345                 if (score < cand_score) {
346                         cand = t;
347                         cand_score = score;
348                 }
349         }
350
351         if (cand != NULL)
352                 return cand;
353
354         dev = ign->fb_tunnel_dev;
355         if (dev->flags & IFF_UP)
356                 return netdev_priv(dev);
357
358         return NULL;
359 }
360
361 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
362                 struct ip_tunnel_parm *parms)
363 {
364         __be32 remote = parms->iph.daddr;
365         __be32 local = parms->iph.saddr;
366         __be32 key = parms->i_key;
367         unsigned int h = HASH(key);
368         int prio = 0;
369
370         if (local)
371                 prio |= 1;
372         if (remote && !ipv4_is_multicast(remote)) {
373                 prio |= 2;
374                 h ^= HASH(remote);
375         }
376
377         return &ign->tunnels[prio][h];
378 }
379
380 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
381                 struct ip_tunnel *t)
382 {
383         return __ipgre_bucket(ign, &t->parms);
384 }
385
386 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
387 {
388         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
389
390         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
391         rcu_assign_pointer(*tp, t);
392 }
393
394 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
395 {
396         struct ip_tunnel __rcu **tp;
397         struct ip_tunnel *iter;
398
399         for (tp = ipgre_bucket(ign, t);
400              (iter = rtnl_dereference(*tp)) != NULL;
401              tp = &iter->next) {
402                 if (t == iter) {
403                         rcu_assign_pointer(*tp, t->next);
404                         break;
405                 }
406         }
407 }
408
409 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
410                                            struct ip_tunnel_parm *parms,
411                                            int type)
412 {
413         __be32 remote = parms->iph.daddr;
414         __be32 local = parms->iph.saddr;
415         __be32 key = parms->i_key;
416         int link = parms->link;
417         struct ip_tunnel *t;
418         struct ip_tunnel __rcu **tp;
419         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
420
421         for (tp = __ipgre_bucket(ign, parms);
422              (t = rtnl_dereference(*tp)) != NULL;
423              tp = &t->next)
424                 if (local == t->parms.iph.saddr &&
425                     remote == t->parms.iph.daddr &&
426                     key == t->parms.i_key &&
427                     link == t->parms.link &&
428                     type == t->dev->type)
429                         break;
430
431         return t;
432 }
433
434 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
435                 struct ip_tunnel_parm *parms, int create)
436 {
437         struct ip_tunnel *t, *nt;
438         struct net_device *dev;
439         char name[IFNAMSIZ];
440         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
441
442         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
443         if (t || !create)
444                 return t;
445
446         if (parms->name[0])
447                 strlcpy(name, parms->name, IFNAMSIZ);
448         else
449                 strcpy(name, "gre%d");
450
451         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
452         if (!dev)
453                 return NULL;
454
455         dev_net_set(dev, net);
456
457         nt = netdev_priv(dev);
458         nt->parms = *parms;
459         dev->rtnl_link_ops = &ipgre_link_ops;
460
461         dev->mtu = ipgre_tunnel_bind_dev(dev);
462
463         if (register_netdevice(dev) < 0)
464                 goto failed_free;
465
466         /* Can use a lockless transmit, unless we generate output sequences */
467         if (!(nt->parms.o_flags & GRE_SEQ))
468                 dev->features |= NETIF_F_LLTX;
469
470         dev_hold(dev);
471         ipgre_tunnel_link(ign, nt);
472         return nt;
473
474 failed_free:
475         free_netdev(dev);
476         return NULL;
477 }
478
479 static void ipgre_tunnel_uninit(struct net_device *dev)
480 {
481         struct net *net = dev_net(dev);
482         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
483
484         ipgre_tunnel_unlink(ign, netdev_priv(dev));
485         dev_put(dev);
486 }
487
488
489 static void ipgre_err(struct sk_buff *skb, u32 info)
490 {
491
492 /* All the routers (except for Linux) return only
493    8 bytes of packet payload. It means, that precise relaying of
494    ICMP in the real Internet is absolutely infeasible.
495
496    Moreover, Cisco "wise men" put GRE key to the third word
497    in GRE header. It makes impossible maintaining even soft state for keyed
498    GRE tunnels with enabled checksum. Tell them "thank you".
499
500    Well, I wonder, rfc1812 was written by Cisco employee,
501    what the hell these idiots break standards established
502    by themselves???
503  */
504
505         const struct iphdr *iph = (const struct iphdr *)skb->data;
506         __be16       *p = (__be16 *)(skb->data+(iph->ihl<<2));
507         int grehlen = (iph->ihl<<2) + 4;
508         const int type = icmp_hdr(skb)->type;
509         const int code = icmp_hdr(skb)->code;
510         struct ip_tunnel *t;
511         __be16 flags;
512         __be32 key = 0;
513
514         flags = p[0];
515         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
516                 if (flags&(GRE_VERSION|GRE_ROUTING))
517                         return;
518                 if (flags&GRE_KEY) {
519                         grehlen += 4;
520                         if (flags&GRE_CSUM)
521                                 grehlen += 4;
522                 }
523         }
524
525         /* If only 8 bytes returned, keyed message will be dropped here */
526         if (skb_headlen(skb) < grehlen)
527                 return;
528
529         if (flags & GRE_KEY)
530                 key = *(((__be32 *)p) + (grehlen / 4) - 1);
531
532         switch (type) {
533         default:
534         case ICMP_PARAMETERPROB:
535                 return;
536
537         case ICMP_DEST_UNREACH:
538                 switch (code) {
539                 case ICMP_SR_FAILED:
540                 case ICMP_PORT_UNREACH:
541                         /* Impossible event. */
542                         return;
543                 default:
544                         /* All others are translated to HOST_UNREACH.
545                            rfc2003 contains "deep thoughts" about NET_UNREACH,
546                            I believe they are just ether pollution. --ANK
547                          */
548                         break;
549                 }
550                 break;
551         case ICMP_TIME_EXCEEDED:
552                 if (code != ICMP_EXC_TTL)
553                         return;
554                 break;
555
556         case ICMP_REDIRECT:
557                 break;
558         }
559
560         rcu_read_lock();
561         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
562                                 flags, key, p[1]);
563
564         if (t == NULL)
565                 goto out;
566
567         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
568                 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
569                                  t->parms.link, 0, IPPROTO_GRE, 0);
570                 goto out;
571         }
572         if (type == ICMP_REDIRECT) {
573                 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
574                               IPPROTO_GRE, 0);
575                 goto out;
576         }
577         if (t->parms.iph.daddr == 0 ||
578             ipv4_is_multicast(t->parms.iph.daddr))
579                 goto out;
580
581         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
582                 goto out;
583
584         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
585                 t->err_count++;
586         else
587                 t->err_count = 1;
588         t->err_time = jiffies;
589 out:
590         rcu_read_unlock();
591 }
592
593 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
594 {
595         if (INET_ECN_is_ce(iph->tos)) {
596                 if (skb->protocol == htons(ETH_P_IP)) {
597                         IP_ECN_set_ce(ip_hdr(skb));
598                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
599                         IP6_ECN_set_ce(ipv6_hdr(skb));
600                 }
601         }
602 }
603
604 static inline u8
605 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
606 {
607         u8 inner = 0;
608         if (skb->protocol == htons(ETH_P_IP))
609                 inner = old_iph->tos;
610         else if (skb->protocol == htons(ETH_P_IPV6))
611                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
612         return INET_ECN_encapsulate(tos, inner);
613 }
614
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE header at skb->data, looks up the receiving tunnel,
 * validates checksum and sequence-number requirements, strips the GRE
 * header and hands the inner packet to netif_rx() on the tunnel device.
 * If no tunnel matches, an ICMP port-unreachable is sent and the skb is
 * freed.  Always returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
        const struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;      /* length of the GRE header parsed so far */
        __be16 gre_proto;

        /* 16 = 4-byte base header + up to three 4-byte optional fields read below. */
        if (!pskb_may_pull(skb, 16))
                goto drop_nolock;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *(__be16 *)h;

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop_nolock;

                if (flags&GRE_CSUM) {
                        /* csum stays 0 unless verification yields a non-zero result. */
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32 *)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32 *)(h + offset));
                        offset += 4;
                }
        }

        /* Protocol type is the second 16-bit word of the base header. */
        gre_proto = *(__be16 *)(h + 2);

        rcu_read_lock();
        tunnel = ipgre_tunnel_lookup(skb->dev,
                                     iph->saddr, iph->daddr, flags, key,
                                     gre_proto);
        if (tunnel) {
                struct pcpu_tstats *tstats;

                secpath_reset(skb);

                skb->protocol = gre_proto;
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        /* High nibble 4 means an IPv4 header starts right here. */
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                skb->mac_header = skb->network_header;
                /* Strip the GRE header; skb->data now points at the inner packet. */
                __pskb_pull(skb, offset);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (rt_is_output_route(skb_rtable(skb)))
                                goto drop;
                        tunnel->dev->stats.multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                /*
                 * Drop when the packet's checksum did not verify, or when
                 * the tunnel requires a checksum and the packet has none.
                 */
                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        tunnel->dev->stats.rx_crc_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        /*
                         * Sequencing required: drop packets without a
                         * sequence number or, once i_seqno is non-zero,
                         * packets older than i_seqno (signed difference,
                         * so wrap-around is tolerated).
                         */
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                tunnel->dev->stats.rx_fifo_errors++;
                                tunnel->dev->stats.rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }

                /* Warning: All skb pointers will be invalidated! */
                if (tunnel->dev->type == ARPHRD_ETHER) {
                        if (!pskb_may_pull(skb, ETH_HLEN)) {
                                tunnel->dev->stats.rx_length_errors++;
                                tunnel->dev->stats.rx_errors++;
                                goto drop;
                        }

                        /* Re-read the outer header pointer after the pull above. */
                        iph = ip_hdr(skb);
                        skb->protocol = eth_type_trans(skb, tunnel->dev);
                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
                }

                /* Account the packet in this CPU's counters. */
                tstats = this_cpu_ptr(tunnel->dev->tstats);
                u64_stats_update_begin(&tstats->syncp);
                tstats->rx_packets++;
                tstats->rx_bytes += skb->len;
                u64_stats_update_end(&tstats->syncp);

                __skb_tunnel_rx(skb, tunnel->dev);

                skb_reset_network_header(skb);
                /* iph still refers to the outer header; propagate its CE mark inward. */
                ipgre_ecn_decapsulate(iph, skb);

                netif_rx(skb);

                rcu_read_unlock();
                return 0;
        }
        /* No tunnel accepted the packet. */
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        rcu_read_unlock();
drop_nolock:
        kfree_skb(skb);
        return 0;
}
754
755 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
756 {
757         struct ip_tunnel *tunnel = netdev_priv(dev);
758         struct pcpu_tstats *tstats;
759         const struct iphdr  *old_iph = ip_hdr(skb);
760         const struct iphdr  *tiph;
761         struct flowi4 fl4;
762         u8     tos;
763         __be16 df;
764         struct rtable *rt;                      /* Route to the other host */
765         struct net_device *tdev;                /* Device to other host */
766         struct iphdr  *iph;                     /* Our new IP header */
767         unsigned int max_headroom;              /* The extra header space needed */
768         int    gre_hlen;
769         __be32 dst;
770         int    mtu;
771
772         if (skb->ip_summed == CHECKSUM_PARTIAL &&
773             skb_checksum_help(skb))
774                 goto tx_error;
775
776         if (dev->type == ARPHRD_ETHER)
777                 IPCB(skb)->flags = 0;
778
779         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
780                 gre_hlen = 0;
781                 tiph = (const struct iphdr *)skb->data;
782         } else {
783                 gre_hlen = tunnel->hlen;
784                 tiph = &tunnel->parms.iph;
785         }
786
787         if ((dst = tiph->daddr) == 0) {
788                 /* NBMA tunnel */
789
790                 if (skb_dst(skb) == NULL) {
791                         dev->stats.tx_fifo_errors++;
792                         goto tx_error;
793                 }
794
795                 if (skb->protocol == htons(ETH_P_IP)) {
796                         rt = skb_rtable(skb);
797                         dst = rt_nexthop(rt, old_iph->daddr);
798                 }
799 #if IS_ENABLED(CONFIG_IPV6)
800                 else if (skb->protocol == htons(ETH_P_IPV6)) {
801                         const struct in6_addr *addr6;
802                         struct neighbour *neigh;
803                         bool do_tx_error_icmp;
804                         int addr_type;
805
806                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
807                         if (neigh == NULL)
808                                 goto tx_error;
809
810                         addr6 = (const struct in6_addr *)&neigh->primary_key;
811                         addr_type = ipv6_addr_type(addr6);
812
813                         if (addr_type == IPV6_ADDR_ANY) {
814                                 addr6 = &ipv6_hdr(skb)->daddr;
815                                 addr_type = ipv6_addr_type(addr6);
816                         }
817
818                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
819                                 do_tx_error_icmp = true;
820                         else {
821                                 do_tx_error_icmp = false;
822                                 dst = addr6->s6_addr32[3];
823                         }
824                         neigh_release(neigh);
825                         if (do_tx_error_icmp)
826                                 goto tx_error_icmp;
827                 }
828 #endif
829                 else
830                         goto tx_error;
831         }
832
833         tos = tiph->tos;
834         if (tos == 1) {
835                 tos = 0;
836                 if (skb->protocol == htons(ETH_P_IP))
837                         tos = old_iph->tos;
838                 else if (skb->protocol == htons(ETH_P_IPV6))
839                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
840         }
841
842         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
843                                  tunnel->parms.o_key, RT_TOS(tos),
844                                  tunnel->parms.link);
845         if (IS_ERR(rt)) {
846                 dev->stats.tx_carrier_errors++;
847                 goto tx_error;
848         }
849         tdev = rt->dst.dev;
850
851         if (tdev == dev) {
852                 ip_rt_put(rt);
853                 dev->stats.collisions++;
854                 goto tx_error;
855         }
856
857         df = tiph->frag_off;
858         if (df)
859                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
860         else
861                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
862
863         if (skb_dst(skb))
864                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
865
866         if (skb->protocol == htons(ETH_P_IP)) {
867                 df |= (old_iph->frag_off&htons(IP_DF));
868
869                 if ((old_iph->frag_off&htons(IP_DF)) &&
870                     mtu < ntohs(old_iph->tot_len)) {
871                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
872                         ip_rt_put(rt);
873                         goto tx_error;
874                 }
875         }
876 #if IS_ENABLED(CONFIG_IPV6)
877         else if (skb->protocol == htons(ETH_P_IPV6)) {
878                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
879
880                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
881                         if ((tunnel->parms.iph.daddr &&
882                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
883                             rt6->rt6i_dst.plen == 128) {
884                                 rt6->rt6i_flags |= RTF_MODIFIED;
885                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
886                         }
887                 }
888
889                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
890                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
891                         ip_rt_put(rt);
892                         goto tx_error;
893                 }
894         }
895 #endif
896
897         if (tunnel->err_count > 0) {
898                 if (time_before(jiffies,
899                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
900                         tunnel->err_count--;
901
902                         dst_link_failure(skb);
903                 } else
904                         tunnel->err_count = 0;
905         }
906
907         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
908
909         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
910             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
911                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
912                 if (max_headroom > dev->needed_headroom)
913                         dev->needed_headroom = max_headroom;
914                 if (!new_skb) {
915                         ip_rt_put(rt);
916                         dev->stats.tx_dropped++;
917                         dev_kfree_skb(skb);
918                         return NETDEV_TX_OK;
919                 }
920                 if (skb->sk)
921                         skb_set_owner_w(new_skb, skb->sk);
922                 dev_kfree_skb(skb);
923                 skb = new_skb;
924                 old_iph = ip_hdr(skb);
925         }
926
927         skb_reset_transport_header(skb);
928         skb_push(skb, gre_hlen);
929         skb_reset_network_header(skb);
930         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
931         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
932                               IPSKB_REROUTED);
933         skb_dst_drop(skb);
934         skb_dst_set(skb, &rt->dst);
935
936         /*
937          *      Push down and install the IPIP header.
938          */
939
940         iph                     =       ip_hdr(skb);
941         iph->version            =       4;
942         iph->ihl                =       sizeof(struct iphdr) >> 2;
943         iph->frag_off           =       df;
944         iph->protocol           =       IPPROTO_GRE;
945         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
946         iph->daddr              =       fl4.daddr;
947         iph->saddr              =       fl4.saddr;
948
949         if ((iph->ttl = tiph->ttl) == 0) {
950                 if (skb->protocol == htons(ETH_P_IP))
951                         iph->ttl = old_iph->ttl;
952 #if IS_ENABLED(CONFIG_IPV6)
953                 else if (skb->protocol == htons(ETH_P_IPV6))
954                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
955 #endif
956                 else
957                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
958         }
959
960         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
961         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
962                                    htons(ETH_P_TEB) : skb->protocol;
963
964         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
965                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
966
967                 if (tunnel->parms.o_flags&GRE_SEQ) {
968                         ++tunnel->o_seqno;
969                         *ptr = htonl(tunnel->o_seqno);
970                         ptr--;
971                 }
972                 if (tunnel->parms.o_flags&GRE_KEY) {
973                         *ptr = tunnel->parms.o_key;
974                         ptr--;
975                 }
976                 if (tunnel->parms.o_flags&GRE_CSUM) {
977                         *ptr = 0;
978                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
979                 }
980         }
981
982         nf_reset(skb);
983         tstats = this_cpu_ptr(dev->tstats);
984         __IPTUNNEL_XMIT(tstats, &dev->stats);
985         return NETDEV_TX_OK;
986
987 #if IS_ENABLED(CONFIG_IPV6)
988 tx_error_icmp:
989         dst_link_failure(skb);
990 #endif
991 tx_error:
992         dev->stats.tx_errors++;
993         dev_kfree_skb(skb);
994         return NETDEV_TX_OK;
995 }
996
997 static int ipgre_tunnel_bind_dev(struct net_device *dev)
998 {
999         struct net_device *tdev = NULL;
1000         struct ip_tunnel *tunnel;
1001         const struct iphdr *iph;
1002         int hlen = LL_MAX_HEADER;
1003         int mtu = ETH_DATA_LEN;
1004         int addend = sizeof(struct iphdr) + 4;
1005
1006         tunnel = netdev_priv(dev);
1007         iph = &tunnel->parms.iph;
1008
1009         /* Guess output device to choose reasonable mtu and needed_headroom */
1010
1011         if (iph->daddr) {
1012                 struct flowi4 fl4;
1013                 struct rtable *rt;
1014
1015                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1016                                          iph->daddr, iph->saddr,
1017                                          tunnel->parms.o_key,
1018                                          RT_TOS(iph->tos),
1019                                          tunnel->parms.link);
1020                 if (!IS_ERR(rt)) {
1021                         tdev = rt->dst.dev;
1022                         ip_rt_put(rt);
1023                 }
1024
1025                 if (dev->type != ARPHRD_ETHER)
1026                         dev->flags |= IFF_POINTOPOINT;
1027         }
1028
1029         if (!tdev && tunnel->parms.link)
1030                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1031
1032         if (tdev) {
1033                 hlen = tdev->hard_header_len + tdev->needed_headroom;
1034                 mtu = tdev->mtu;
1035         }
1036         dev->iflink = tunnel->parms.link;
1037
1038         /* Precalculate GRE options length */
1039         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1040                 if (tunnel->parms.o_flags&GRE_CSUM)
1041                         addend += 4;
1042                 if (tunnel->parms.o_flags&GRE_KEY)
1043                         addend += 4;
1044                 if (tunnel->parms.o_flags&GRE_SEQ)
1045                         addend += 4;
1046         }
1047         dev->needed_headroom = addend + hlen;
1048         mtu -= dev->hard_header_len + addend;
1049
1050         if (mtu < 68)
1051                 mtu = 68;
1052
1053         tunnel->hlen = addend;
1054
1055         return mtu;
1056 }
1057
1058 static int
1059 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1060 {
1061         int err = 0;
1062         struct ip_tunnel_parm p;
1063         struct ip_tunnel *t;
1064         struct net *net = dev_net(dev);
1065         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1066
1067         switch (cmd) {
1068         case SIOCGETTUNNEL:
1069                 t = NULL;
1070                 if (dev == ign->fb_tunnel_dev) {
1071                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1072                                 err = -EFAULT;
1073                                 break;
1074                         }
1075                         t = ipgre_tunnel_locate(net, &p, 0);
1076                 }
1077                 if (t == NULL)
1078                         t = netdev_priv(dev);
1079                 memcpy(&p, &t->parms, sizeof(p));
1080                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1081                         err = -EFAULT;
1082                 break;
1083
1084         case SIOCADDTUNNEL:
1085         case SIOCCHGTUNNEL:
1086                 err = -EPERM;
1087                 if (!capable(CAP_NET_ADMIN))
1088                         goto done;
1089
1090                 err = -EFAULT;
1091                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1092                         goto done;
1093
1094                 err = -EINVAL;
1095                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1096                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1097                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1098                         goto done;
1099                 if (p.iph.ttl)
1100                         p.iph.frag_off |= htons(IP_DF);
1101
1102                 if (!(p.i_flags&GRE_KEY))
1103                         p.i_key = 0;
1104                 if (!(p.o_flags&GRE_KEY))
1105                         p.o_key = 0;
1106
1107                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1108
1109                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1110                         if (t != NULL) {
1111                                 if (t->dev != dev) {
1112                                         err = -EEXIST;
1113                                         break;
1114                                 }
1115                         } else {
1116                                 unsigned int nflags = 0;
1117
1118                                 t = netdev_priv(dev);
1119
1120                                 if (ipv4_is_multicast(p.iph.daddr))
1121                                         nflags = IFF_BROADCAST;
1122                                 else if (p.iph.daddr)
1123                                         nflags = IFF_POINTOPOINT;
1124
1125                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1126                                         err = -EINVAL;
1127                                         break;
1128                                 }
1129                                 ipgre_tunnel_unlink(ign, t);
1130                                 synchronize_net();
1131                                 t->parms.iph.saddr = p.iph.saddr;
1132                                 t->parms.iph.daddr = p.iph.daddr;
1133                                 t->parms.i_key = p.i_key;
1134                                 t->parms.o_key = p.o_key;
1135                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1136                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1137                                 ipgre_tunnel_link(ign, t);
1138                                 netdev_state_change(dev);
1139                         }
1140                 }
1141
1142                 if (t) {
1143                         err = 0;
1144                         if (cmd == SIOCCHGTUNNEL) {
1145                                 t->parms.iph.ttl = p.iph.ttl;
1146                                 t->parms.iph.tos = p.iph.tos;
1147                                 t->parms.iph.frag_off = p.iph.frag_off;
1148                                 if (t->parms.link != p.link) {
1149                                         t->parms.link = p.link;
1150                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1151                                         netdev_state_change(dev);
1152                                 }
1153                         }
1154                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1155                                 err = -EFAULT;
1156                 } else
1157                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1158                 break;
1159
1160         case SIOCDELTUNNEL:
1161                 err = -EPERM;
1162                 if (!capable(CAP_NET_ADMIN))
1163                         goto done;
1164
1165                 if (dev == ign->fb_tunnel_dev) {
1166                         err = -EFAULT;
1167                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1168                                 goto done;
1169                         err = -ENOENT;
1170                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1171                                 goto done;
1172                         err = -EPERM;
1173                         if (t == netdev_priv(ign->fb_tunnel_dev))
1174                                 goto done;
1175                         dev = t->dev;
1176                 }
1177                 unregister_netdevice(dev);
1178                 err = 0;
1179                 break;
1180
1181         default:
1182                 err = -EINVAL;
1183         }
1184
1185 done:
1186         return err;
1187 }
1188
1189 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1190 {
1191         struct ip_tunnel *tunnel = netdev_priv(dev);
1192         if (new_mtu < 68 ||
1193             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1194                 return -EINVAL;
1195         dev->mtu = new_mtu;
1196         return 0;
1197 }
1198
1199 /* Nice toy. Unfortunately, useless in real life :-)
1200    It allows constructing a virtual multiprotocol broadcast "LAN"
1201    over the Internet, provided multicast routing is tuned.
1202
1203
1204    I have no idea whether this bicycle was invented before me,
1205    so I had to set ARPHRD_IPGRE to a random value.
1206    I have the impression that Cisco could make something similar,
1207    but this feature is apparently missing in IOS<=11.2(8).
1208
1209    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1210    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1211
1212    ping -t 255 224.66.66.66
1213
1214    If nobody answers, mbone does not work.
1215
1216    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1217    ip addr add 10.66.66.<somewhat>/24 dev Universe
1218    ifconfig Universe up
1219    ifconfig Universe add fe80::<Your_real_addr>/10
1220    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1221    ftp 10.66.66.66
1222    ...
1223    ftp fec0:6666:6666::193.233.7.65
1224    ...
1225
1226  */
1227
1228 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1229                         unsigned short type,
1230                         const void *daddr, const void *saddr, unsigned int len)
1231 {
1232         struct ip_tunnel *t = netdev_priv(dev);
1233         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1234         __be16 *p = (__be16 *)(iph+1);
1235
1236         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1237         p[0]            = t->parms.o_flags;
1238         p[1]            = htons(type);
1239
1240         /*
1241          *      Set the source hardware address.
1242          */
1243
1244         if (saddr)
1245                 memcpy(&iph->saddr, saddr, 4);
1246         if (daddr)
1247                 memcpy(&iph->daddr, daddr, 4);
1248         if (iph->daddr)
1249                 return t->hlen;
1250
1251         return -t->hlen;
1252 }
1253
1254 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1255 {
1256         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1257         memcpy(haddr, &iph->saddr, 4);
1258         return 4;
1259 }
1260
1261 static const struct header_ops ipgre_header_ops = {
1262         .create = ipgre_header,
1263         .parse  = ipgre_header_parse,
1264 };
1265
1266 #ifdef CONFIG_NET_IPGRE_BROADCAST
1267 static int ipgre_open(struct net_device *dev)
1268 {
1269         struct ip_tunnel *t = netdev_priv(dev);
1270
1271         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1272                 struct flowi4 fl4;
1273                 struct rtable *rt;
1274
1275                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1276                                          t->parms.iph.daddr,
1277                                          t->parms.iph.saddr,
1278                                          t->parms.o_key,
1279                                          RT_TOS(t->parms.iph.tos),
1280                                          t->parms.link);
1281                 if (IS_ERR(rt))
1282                         return -EADDRNOTAVAIL;
1283                 dev = rt->dst.dev;
1284                 ip_rt_put(rt);
1285                 if (__in_dev_get_rtnl(dev) == NULL)
1286                         return -EADDRNOTAVAIL;
1287                 t->mlink = dev->ifindex;
1288                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1289         }
1290         return 0;
1291 }
1292
1293 static int ipgre_close(struct net_device *dev)
1294 {
1295         struct ip_tunnel *t = netdev_priv(dev);
1296
1297         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1298                 struct in_device *in_dev;
1299                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1300                 if (in_dev)
1301                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1302         }
1303         return 0;
1304 }
1305
1306 #endif
1307
1308 static const struct net_device_ops ipgre_netdev_ops = {
1309         .ndo_init               = ipgre_tunnel_init,
1310         .ndo_uninit             = ipgre_tunnel_uninit,
1311 #ifdef CONFIG_NET_IPGRE_BROADCAST
1312         .ndo_open               = ipgre_open,
1313         .ndo_stop               = ipgre_close,
1314 #endif
1315         .ndo_start_xmit         = ipgre_tunnel_xmit,
1316         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1317         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1318         .ndo_get_stats64        = ipgre_get_stats64,
1319 };
1320
1321 static void ipgre_dev_free(struct net_device *dev)
1322 {
1323         free_percpu(dev->tstats);
1324         free_netdev(dev);
1325 }
1326
/* Feature flags set on plain GRE tunnel devices by ipgre_tunnel_setup(). */
#define GRE_FEATURES \
	(NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM)
1331
1332 static void ipgre_tunnel_setup(struct net_device *dev)
1333 {
1334         dev->netdev_ops         = &ipgre_netdev_ops;
1335         dev->destructor         = ipgre_dev_free;
1336
1337         dev->type               = ARPHRD_IPGRE;
1338         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1339         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1340         dev->flags              = IFF_NOARP;
1341         dev->iflink             = 0;
1342         dev->addr_len           = 4;
1343         dev->features           |= NETIF_F_NETNS_LOCAL;
1344         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1345
1346         dev->features           |= GRE_FEATURES;
1347         dev->hw_features        |= GRE_FEATURES;
1348 }
1349
1350 static int ipgre_tunnel_init(struct net_device *dev)
1351 {
1352         struct ip_tunnel *tunnel;
1353         struct iphdr *iph;
1354
1355         tunnel = netdev_priv(dev);
1356         iph = &tunnel->parms.iph;
1357
1358         tunnel->dev = dev;
1359         strcpy(tunnel->parms.name, dev->name);
1360
1361         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1362         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1363
1364         if (iph->daddr) {
1365 #ifdef CONFIG_NET_IPGRE_BROADCAST
1366                 if (ipv4_is_multicast(iph->daddr)) {
1367                         if (!iph->saddr)
1368                                 return -EINVAL;
1369                         dev->flags = IFF_BROADCAST;
1370                         dev->header_ops = &ipgre_header_ops;
1371                 }
1372 #endif
1373         } else
1374                 dev->header_ops = &ipgre_header_ops;
1375
1376         dev->tstats = alloc_percpu(struct pcpu_tstats);
1377         if (!dev->tstats)
1378                 return -ENOMEM;
1379
1380         return 0;
1381 }
1382
1383 static void ipgre_fb_tunnel_init(struct net_device *dev)
1384 {
1385         struct ip_tunnel *tunnel = netdev_priv(dev);
1386         struct iphdr *iph = &tunnel->parms.iph;
1387
1388         tunnel->dev = dev;
1389         strcpy(tunnel->parms.name, dev->name);
1390
1391         iph->version            = 4;
1392         iph->protocol           = IPPROTO_GRE;
1393         iph->ihl                = 5;
1394         tunnel->hlen            = sizeof(struct iphdr) + 4;
1395
1396         dev_hold(dev);
1397 }
1398
1399
1400 static const struct gre_protocol ipgre_protocol = {
1401         .handler     = ipgre_rcv,
1402         .err_handler = ipgre_err,
1403 };
1404
1405 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1406 {
1407         int prio;
1408
1409         for (prio = 0; prio < 4; prio++) {
1410                 int h;
1411                 for (h = 0; h < HASH_SIZE; h++) {
1412                         struct ip_tunnel *t;
1413
1414                         t = rtnl_dereference(ign->tunnels[prio][h]);
1415
1416                         while (t != NULL) {
1417                                 unregister_netdevice_queue(t->dev, head);
1418                                 t = rtnl_dereference(t->next);
1419                         }
1420                 }
1421         }
1422 }
1423
1424 static int __net_init ipgre_init_net(struct net *net)
1425 {
1426         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1427         int err;
1428
1429         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1430                                            ipgre_tunnel_setup);
1431         if (!ign->fb_tunnel_dev) {
1432                 err = -ENOMEM;
1433                 goto err_alloc_dev;
1434         }
1435         dev_net_set(ign->fb_tunnel_dev, net);
1436
1437         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1438         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1439
1440         if ((err = register_netdev(ign->fb_tunnel_dev)))
1441                 goto err_reg_dev;
1442
1443         rcu_assign_pointer(ign->tunnels_wc[0],
1444                            netdev_priv(ign->fb_tunnel_dev));
1445         return 0;
1446
1447 err_reg_dev:
1448         ipgre_dev_free(ign->fb_tunnel_dev);
1449 err_alloc_dev:
1450         return err;
1451 }
1452
1453 static void __net_exit ipgre_exit_net(struct net *net)
1454 {
1455         struct ipgre_net *ign;
1456         LIST_HEAD(list);
1457
1458         ign = net_generic(net, ipgre_net_id);
1459         rtnl_lock();
1460         ipgre_destroy_tunnels(ign, &list);
1461         unregister_netdevice_many(&list);
1462         rtnl_unlock();
1463 }
1464
1465 static struct pernet_operations ipgre_net_ops = {
1466         .init = ipgre_init_net,
1467         .exit = ipgre_exit_net,
1468         .id   = &ipgre_net_id,
1469         .size = sizeof(struct ipgre_net),
1470 };
1471
1472 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1473 {
1474         __be16 flags;
1475
1476         if (!data)
1477                 return 0;
1478
1479         flags = 0;
1480         if (data[IFLA_GRE_IFLAGS])
1481                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1482         if (data[IFLA_GRE_OFLAGS])
1483                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1484         if (flags & (GRE_VERSION|GRE_ROUTING))
1485                 return -EINVAL;
1486
1487         return 0;
1488 }
1489
1490 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1491 {
1492         __be32 daddr;
1493
1494         if (tb[IFLA_ADDRESS]) {
1495                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1496                         return -EINVAL;
1497                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1498                         return -EADDRNOTAVAIL;
1499         }
1500
1501         if (!data)
1502                 goto out;
1503
1504         if (data[IFLA_GRE_REMOTE]) {
1505                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1506                 if (!daddr)
1507                         return -EINVAL;
1508         }
1509
1510 out:
1511         return ipgre_tunnel_validate(tb, data);
1512 }
1513
1514 static void ipgre_netlink_parms(struct nlattr *data[],
1515                                 struct ip_tunnel_parm *parms)
1516 {
1517         memset(parms, 0, sizeof(*parms));
1518
1519         parms->iph.protocol = IPPROTO_GRE;
1520
1521         if (!data)
1522                 return;
1523
1524         if (data[IFLA_GRE_LINK])
1525                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1526
1527         if (data[IFLA_GRE_IFLAGS])
1528                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1529
1530         if (data[IFLA_GRE_OFLAGS])
1531                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1532
1533         if (data[IFLA_GRE_IKEY])
1534                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1535
1536         if (data[IFLA_GRE_OKEY])
1537                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1538
1539         if (data[IFLA_GRE_LOCAL])
1540                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1541
1542         if (data[IFLA_GRE_REMOTE])
1543                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1544
1545         if (data[IFLA_GRE_TTL])
1546                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1547
1548         if (data[IFLA_GRE_TOS])
1549                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1550
1551         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1552                 parms->iph.frag_off = htons(IP_DF);
1553 }
1554
1555 static int ipgre_tap_init(struct net_device *dev)
1556 {
1557         struct ip_tunnel *tunnel;
1558
1559         tunnel = netdev_priv(dev);
1560
1561         tunnel->dev = dev;
1562         strcpy(tunnel->parms.name, dev->name);
1563
1564         ipgre_tunnel_bind_dev(dev);
1565
1566         dev->tstats = alloc_percpu(struct pcpu_tstats);
1567         if (!dev->tstats)
1568                 return -ENOMEM;
1569
1570         return 0;
1571 }
1572
1573 static const struct net_device_ops ipgre_tap_netdev_ops = {
1574         .ndo_init               = ipgre_tap_init,
1575         .ndo_uninit             = ipgre_tunnel_uninit,
1576         .ndo_start_xmit         = ipgre_tunnel_xmit,
1577         .ndo_set_mac_address    = eth_mac_addr,
1578         .ndo_validate_addr      = eth_validate_addr,
1579         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1580         .ndo_get_stats64        = ipgre_get_stats64,
1581 };
1582
1583 static void ipgre_tap_setup(struct net_device *dev)
1584 {
1585
1586         ether_setup(dev);
1587
1588         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1589         dev->destructor         = ipgre_dev_free;
1590
1591         dev->iflink             = 0;
1592         dev->features           |= NETIF_F_NETNS_LOCAL;
1593 }
1594
1595 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1596                          struct nlattr *data[])
1597 {
1598         struct ip_tunnel *nt;
1599         struct net *net = dev_net(dev);
1600         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1601         int mtu;
1602         int err;
1603
1604         nt = netdev_priv(dev);
1605         ipgre_netlink_parms(data, &nt->parms);
1606
1607         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1608                 return -EEXIST;
1609
1610         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1611                 eth_hw_addr_random(dev);
1612
1613         mtu = ipgre_tunnel_bind_dev(dev);
1614         if (!tb[IFLA_MTU])
1615                 dev->mtu = mtu;
1616
1617         /* Can use a lockless transmit, unless we generate output sequences */
1618         if (!(nt->parms.o_flags & GRE_SEQ))
1619                 dev->features |= NETIF_F_LLTX;
1620
1621         err = register_netdevice(dev);
1622         if (err)
1623                 goto out;
1624
1625         dev_hold(dev);
1626         ipgre_tunnel_link(ign, nt);
1627
1628 out:
1629         return err;
1630 }
1631
1632 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1633                             struct nlattr *data[])
1634 {
1635         struct ip_tunnel *t, *nt;
1636         struct net *net = dev_net(dev);
1637         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1638         struct ip_tunnel_parm p;
1639         int mtu;
1640
1641         if (dev == ign->fb_tunnel_dev)
1642                 return -EINVAL;
1643
1644         nt = netdev_priv(dev);
1645         ipgre_netlink_parms(data, &p);
1646
1647         t = ipgre_tunnel_locate(net, &p, 0);
1648
1649         if (t) {
1650                 if (t->dev != dev)
1651                         return -EEXIST;
1652         } else {
1653                 t = nt;
1654
1655                 if (dev->type != ARPHRD_ETHER) {
1656                         unsigned int nflags = 0;
1657
1658                         if (ipv4_is_multicast(p.iph.daddr))
1659                                 nflags = IFF_BROADCAST;
1660                         else if (p.iph.daddr)
1661                                 nflags = IFF_POINTOPOINT;
1662
1663                         if ((dev->flags ^ nflags) &
1664                             (IFF_POINTOPOINT | IFF_BROADCAST))
1665                                 return -EINVAL;
1666                 }
1667
1668                 ipgre_tunnel_unlink(ign, t);
1669                 t->parms.iph.saddr = p.iph.saddr;
1670                 t->parms.iph.daddr = p.iph.daddr;
1671                 t->parms.i_key = p.i_key;
1672                 if (dev->type != ARPHRD_ETHER) {
1673                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1674                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1675                 }
1676                 ipgre_tunnel_link(ign, t);
1677                 netdev_state_change(dev);
1678         }
1679
1680         t->parms.o_key = p.o_key;
1681         t->parms.iph.ttl = p.iph.ttl;
1682         t->parms.iph.tos = p.iph.tos;
1683         t->parms.iph.frag_off = p.iph.frag_off;
1684
1685         if (t->parms.link != p.link) {
1686                 t->parms.link = p.link;
1687                 mtu = ipgre_tunnel_bind_dev(dev);
1688                 if (!tb[IFLA_MTU])
1689                         dev->mtu = mtu;
1690                 netdev_state_change(dev);
1691         }
1692
1693         return 0;
1694 }
1695
1696 static size_t ipgre_get_size(const struct net_device *dev)
1697 {
1698         return
1699                 /* IFLA_GRE_LINK */
1700                 nla_total_size(4) +
1701                 /* IFLA_GRE_IFLAGS */
1702                 nla_total_size(2) +
1703                 /* IFLA_GRE_OFLAGS */
1704                 nla_total_size(2) +
1705                 /* IFLA_GRE_IKEY */
1706                 nla_total_size(4) +
1707                 /* IFLA_GRE_OKEY */
1708                 nla_total_size(4) +
1709                 /* IFLA_GRE_LOCAL */
1710                 nla_total_size(4) +
1711                 /* IFLA_GRE_REMOTE */
1712                 nla_total_size(4) +
1713                 /* IFLA_GRE_TTL */
1714                 nla_total_size(1) +
1715                 /* IFLA_GRE_TOS */
1716                 nla_total_size(1) +
1717                 /* IFLA_GRE_PMTUDISC */
1718                 nla_total_size(1) +
1719                 0;
1720 }
1721
1722 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1723 {
1724         struct ip_tunnel *t = netdev_priv(dev);
1725         struct ip_tunnel_parm *p = &t->parms;
1726
1727         if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1728             nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1729             nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1730             nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1731             nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1732             nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1733             nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1734             nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1735             nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1736             nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1737                        !!(p->iph.frag_off & htons(IP_DF))))
1738                 goto nla_put_failure;
1739         return 0;
1740
1741 nla_put_failure:
1742         return -EMSGSIZE;
1743 }
1744
1745 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1746         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1747         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1748         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1749         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1750         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1751         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1752         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1753         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1754         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1755         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1756 };
1757
1758 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1759         .kind           = "gre",
1760         .maxtype        = IFLA_GRE_MAX,
1761         .policy         = ipgre_policy,
1762         .priv_size      = sizeof(struct ip_tunnel),
1763         .setup          = ipgre_tunnel_setup,
1764         .validate       = ipgre_tunnel_validate,
1765         .newlink        = ipgre_newlink,
1766         .changelink     = ipgre_changelink,
1767         .get_size       = ipgre_get_size,
1768         .fill_info      = ipgre_fill_info,
1769 };
1770
1771 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1772         .kind           = "gretap",
1773         .maxtype        = IFLA_GRE_MAX,
1774         .policy         = ipgre_policy,
1775         .priv_size      = sizeof(struct ip_tunnel),
1776         .setup          = ipgre_tap_setup,
1777         .validate       = ipgre_tap_validate,
1778         .newlink        = ipgre_newlink,
1779         .changelink     = ipgre_changelink,
1780         .get_size       = ipgre_get_size,
1781         .fill_info      = ipgre_fill_info,
1782 };
1783
/*
 *      And now the module code and kernel interface.
 */
1787
1788 static int __init ipgre_init(void)
1789 {
1790         int err;
1791
1792         pr_info("GRE over IPv4 tunneling driver\n");
1793
1794         err = register_pernet_device(&ipgre_net_ops);
1795         if (err < 0)
1796                 return err;
1797
1798         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1799         if (err < 0) {
1800                 pr_info("%s: can't add protocol\n", __func__);
1801                 goto add_proto_failed;
1802         }
1803
1804         err = rtnl_link_register(&ipgre_link_ops);
1805         if (err < 0)
1806                 goto rtnl_link_failed;
1807
1808         err = rtnl_link_register(&ipgre_tap_ops);
1809         if (err < 0)
1810                 goto tap_ops_failed;
1811
1812 out:
1813         return err;
1814
1815 tap_ops_failed:
1816         rtnl_link_unregister(&ipgre_link_ops);
1817 rtnl_link_failed:
1818         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1819 add_proto_failed:
1820         unregister_pernet_device(&ipgre_net_ops);
1821         goto out;
1822 }
1823
1824 static void __exit ipgre_fini(void)
1825 {
1826         rtnl_link_unregister(&ipgre_tap_ops);
1827         rtnl_link_unregister(&ipgre_link_ops);
1828         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1829                 pr_info("%s: can't remove protocol\n", __func__);
1830         unregister_pernet_device(&ipgre_net_ops);
1831 }
1832
1833 module_init(ipgre_init);
1834 module_exit(ipgre_fini);
1835 MODULE_LICENSE("GPL");
1836 MODULE_ALIAS_RTNL_LINK("gre");
1837 MODULE_ALIAS_RTNL_LINK("gretap");
1838 MODULE_ALIAS_NETDEV("gre0");