]> rtime.felk.cvut.cz Git - mcf548x/linux.git/blob - net/ipv6/route.c
Initial 2.6.37
[mcf548x/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  Set to 3 to get tracing: at that level
 * RDBG() and RT6_TRACE() expand to printk() calls; below it both expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
/* RDBG() pastes its argument after printk, so pass a parenthesised list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_pol_route() builds a cached copy via rt6_alloc_clone()
 * for routes that do not need rt6_alloc_cow(); when zero it returns the
 * looked-up route itself in that case.
 */
#define CLONE_OFFLINK_ROUTE 0
76
/*
 * Forward declarations for static functions that are referenced before
 * their definitions: the dst_ops callbacks used by the ops templates
 * below, the packet discard handlers used by the route templates, and
 * (with CONFIG_IPV6_ROUTE_INFO) the route-info helpers used by
 * rt6_route_rcv().
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
99
/*
 * dst_ops template for regular IPv6 routes: wires the generic dst layer
 * callbacks (gc, validity check, destroy, ifdown, negative advice, link
 * failure, PMTU update, local output) to the IPv6 implementations.
 * NOTE(review): presumably copied into each namespace's
 * net->ipv6.ip6_dst_ops at init time -- confirm against the init code.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
};
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
/*
 * dst_ops used for the entries allocated by ip6_dst_blackhole().  Only
 * destroy, check and a no-op update_pmtu are provided; there is no gc,
 * ifdown, negative_advice or link_failure handler.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
};
125
/*
 * Template for the "null" route entry: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) with error -ENETUNREACH whose input/output handlers are
 * the ip6_pkt_discard*() functions.  It carries a hop limit metric of 255
 * and the largest possible route metric.
 * NOTE(review): presumably instantiated per namespace as
 * net->ipv6.ip6_null_entry -- confirm against the init code.
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
141
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: same shape as the null entry
 * template, but with error -EACCES and the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route entry: error -EINVAL, with both the
 * input and the output handler set to dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

#endif
180
/*
 * Allocate a dst entry through @ops and hand it back typed as the
 * enclosing rt6_info.  Whatever dst_alloc() returns (including a failed
 * allocation) is passed through unchanged.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct rt6_info *rt;

        rt = (struct rt6_info *)dst_alloc(ops);
        return rt;
}
186
187 static void ip6_dst_destroy(struct dst_entry *dst)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190         struct inet6_dev *idev = rt->rt6i_idev;
191
192         if (idev != NULL) {
193                 rt->rt6i_idev = NULL;
194                 in6_dev_put(idev);
195         }
196 }
197
198 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
199                            int how)
200 {
201         struct rt6_info *rt = (struct rt6_info *)dst;
202         struct inet6_dev *idev = rt->rt6i_idev;
203         struct net_device *loopback_dev =
204                 dev_net(dev)->loopback_dev;
205
206         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
207                 struct inet6_dev *loopback_idev =
208                         in6_dev_get(loopback_dev);
209                 if (loopback_idev != NULL) {
210                         rt->rt6i_idev = loopback_idev;
211                         in6_dev_put(idev);
212                 }
213         }
214 }
215
216 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
217 {
218         return (rt->rt6i_flags & RTF_EXPIRES) &&
219                 time_after(jiffies, rt->rt6i_expires);
220 }
221
222 static inline int rt6_need_strict(struct in6_addr *daddr)
223 {
224         return ipv6_addr_type(daddr) &
225                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
226 }
227
228 /*
229  *      Route lookup. Any table->tb6_lock is implied.
230  */
231
232 static inline struct rt6_info *rt6_device_match(struct net *net,
233                                                     struct rt6_info *rt,
234                                                     struct in6_addr *saddr,
235                                                     int oif,
236                                                     int flags)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (!oif && ipv6_addr_any(saddr))
242                 goto out;
243
244         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
245                 struct net_device *dev = sprt->rt6i_dev;
246
247                 if (oif) {
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
254                                                 continue;
255                                         if (local && (!oif ||
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 } else {
262                         if (ipv6_chk_addr(net, saddr, dev,
263                                           flags & RT6_LOOKUP_F_IFACE))
264                                 return sprt;
265                 }
266         }
267
268         if (oif) {
269                 if (local)
270                         return local;
271
272                 if (flags & RT6_LOOKUP_F_IFACE)
273                         return net->ipv6.ip6_null_entry;
274         }
275 out:
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - send a rate-limited reachability probe to a route's nexthop
 * @rt: route to probe; may be NULL, in which case nothing happens
 *
 * If the nexthop neighbour entry is not in a NUD_VALID state and more
 * than rtr_probe_interval jiffies have passed since neigh->updated, a
 * Neighbour Solicitation for the neighbour's address is sent to its
 * solicited-node multicast address.
 *
 * neigh->updated is rewritten here to implement the rate limit, so the
 * check-and-update sequence runs under the write side of neigh->lock;
 * with only the read side held, concurrent callers could all pass the
 * time check before any of them stored the new timestamp.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;
        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                /* Drop the lock before transmitting. */
                write_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                write_unlock_bh(&neigh->lock);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
313
314 /*
315  * Default Router Selection (RFC 2461 6.3.6)
316  */
317 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 {
319         struct net_device *dev = rt->rt6i_dev;
320         if (!oif || dev->ifindex == oif)
321                 return 2;
322         if ((dev->flags & IFF_LOOPBACK) &&
323             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
324                 return 1;
325         return 0;
326 }
327
328 static inline int rt6_check_neigh(struct rt6_info *rt)
329 {
330         struct neighbour *neigh = rt->rt6i_nexthop;
331         int m;
332         if (rt->rt6i_flags & RTF_NONEXTHOP ||
333             !(rt->rt6i_flags & RTF_GATEWAY))
334                 m = 1;
335         else if (neigh) {
336                 read_lock_bh(&neigh->lock);
337                 if (neigh->nud_state & NUD_VALID)
338                         m = 2;
339 #ifdef CONFIG_IPV6_ROUTER_PREF
340                 else if (neigh->nud_state & NUD_FAILED)
341                         m = 0;
342 #endif
343                 else
344                         m = 1;
345                 read_unlock_bh(&neigh->lock);
346         } else
347                 m = 0;
348         return m;
349 }
350
351 static int rt6_score_route(struct rt6_info *rt, int oif,
352                            int strict)
353 {
354         int m, n;
355
356         m = rt6_check_dev(rt, oif);
357         if (!m && (strict & RT6_LOOKUP_F_IFACE))
358                 return -1;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
361 #endif
362         n = rt6_check_neigh(rt);
363         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
364                 return -1;
365         return m;
366 }
367
368 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
369                                    int *mpri, struct rt6_info *match)
370 {
371         int m;
372
373         if (rt6_check_expired(rt))
374                 goto out;
375
376         m = rt6_score_route(rt, oif, strict);
377         if (m < 0)
378                 goto out;
379
380         if (m > *mpri) {
381                 if (strict & RT6_LOOKUP_F_REACHABLE)
382                         rt6_probe(match);
383                 *mpri = m;
384                 match = rt;
385         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
386                 rt6_probe(rt);
387         }
388
389 out:
390         return match;
391 }
392
393 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
394                                      struct rt6_info *rr_head,
395                                      u32 metric, int oif, int strict)
396 {
397         struct rt6_info *rt, *match;
398         int mpri = -1;
399
400         match = NULL;
401         for (rt = rr_head; rt && rt->rt6i_metric == metric;
402              rt = rt->dst.rt6_next)
403                 match = find_match(rt, oif, strict, &mpri, match);
404         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
405              rt = rt->dst.rt6_next)
406                 match = find_match(rt, oif, strict, &mpri, match);
407
408         return match;
409 }
410
411 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 {
413         struct rt6_info *match, *rt0;
414         struct net *net;
415
416         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417                   __func__, fn->leaf, oif);
418
419         rt0 = fn->rr_ptr;
420         if (!rt0)
421                 fn->rr_ptr = rt0 = fn->leaf;
422
423         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424
425         if (!match &&
426             (strict & RT6_LOOKUP_F_REACHABLE)) {
427                 struct rt6_info *next = rt0->dst.rt6_next;
428
429                 /* no entries matched; do round-robin */
430                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
431                         next = fn->leaf;
432
433                 if (next != rt0)
434                         fn->rr_ptr = next;
435         }
436
437         RT6_TRACE("%s() => %p\n",
438                   __func__, match);
439
440         net = dev_net(rt0->rt6i_dev);
441         return match ? match : net->ipv6.ip6_null_entry;
442 }
443
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a Route Information option (RFC 4191)
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to associate with the route
 *
 * Validates the option, then looks up the matching route-info route.  A
 * zero lifetime deletes an existing route; a non-zero lifetime creates
 * the route if it does not exist yet, or refreshes the preference bits of
 * an existing one.  The expiry is then set from the lifetime (or cleared
 * for an infinite lifetime).
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /*
         * Sanity check for prefix_len and length.
         *
         * The option length counts 8-octet units including the fixed
         * 8-octet header, so the prefix field holds 0, 8 or 16 octets for
         * length 1, 2 or 3.  RFC 4191 section 2.3 therefore requires
         * length 3 for prefixes longer than 64 bits and length >= 2 for
         * any non-empty prefix.  Accepting shorter options would make the
         * prefix copy below read past the end of the option.
         */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 3) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3)
                /* Full 16-octet prefix present: use it in place. */
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /*
                 * Shorter prefix field: copy only prefix_len bits into a
                 * local buffer.  The checks above guarantee those bits
                 * lie inside the option.
                 */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* Zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                /* Existing route: replace only the preference bits. */
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime)) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                dst_release(&rt->dst);
        }
        return 0;
}
#endif
517
/*
 * BACKTRACK - climb the fib tree when the current node yielded no route
 * @__net: namespace whose ip6_null_entry marks "no route found"
 * @saddr: source address used when descending into a source subtree
 *
 * This macro is not self-contained.  It reads the local variable "rt",
 * reads and updates the local variable "fn", and jumps to the labels
 * "out" and "restart", all of which must exist in the calling function.
 *
 * If rt is the null entry, walk towards the root: stop with "goto out"
 * at a node flagged RTN_TL_ROOT; otherwise move to the parent, or into
 * the parent's subtree (looked up by @saddr) when the parent has one that
 * is not the node just left.  As soon as the new node carries
 * RTN_RTINFO, "goto restart" re-runs the caller's selection on it.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == (__net)->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
535
536 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
537                                              struct fib6_table *table,
538                                              struct flowi *fl, int flags)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt;
542
543         read_lock_bh(&table->tb6_lock);
544         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
545 restart:
546         rt = fn->leaf;
547         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
548         BACKTRACK(net, &fl->fl6_src);
549 out:
550         dst_use(&rt->dst, jiffies);
551         read_unlock_bh(&table->tb6_lock);
552         return rt;
553
554 }
555
556 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
557                             const struct in6_addr *saddr, int oif, int strict)
558 {
559         struct flowi fl = {
560                 .oif = oif,
561                 .nl_u = {
562                         .ip6_u = {
563                                 .daddr = *daddr,
564                         },
565                 },
566         };
567         struct dst_entry *dst;
568         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
569
570         if (saddr) {
571                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
572                 flags |= RT6_LOOKUP_F_HAS_SADDR;
573         }
574
575         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
576         if (dst->error == 0)
577                 return (struct rt6_info *) dst;
578
579         dst_release(dst);
580
581         return NULL;
582 }
583
584 EXPORT_SYMBOL(rt6_lookup);
585
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold a reference
   to it, it may be destroyed.
 */
591
592 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
593 {
594         int err;
595         struct fib6_table *table;
596
597         table = rt->rt6i_table;
598         write_lock_bh(&table->tb6_lock);
599         err = fib6_add(&table->tb6_root, rt, info);
600         write_unlock_bh(&table->tb6_lock);
601
602         return err;
603 }
604
605 int ip6_ins_rt(struct rt6_info *rt)
606 {
607         struct nl_info info = {
608                 .nl_net = dev_net(rt->rt6i_dev),
609         };
610         return __ip6_ins_rt(rt, &info);
611 }
612
/*
 * Build a host (/128) RTF_CACHE copy of @ort for destination @daddr and
 * attach a neighbour entry for its gateway address.
 *
 * @saddr is only used with CONFIG_IPV6_SUBTREES, to pin the source key of
 * the copy to a host address when the original route has a source prefix.
 *
 * Returns the new route.  Returns NULL if ip6_rt_copy() returned NULL, or
 * if no neighbour entry could be obtained; in the latter case the copy is
 * released with dst_free() before returning.
 */
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
                                      struct in6_addr *saddr)
{
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        rt = ip6_rt_copy(ort);

        if (rt) {
                struct neighbour *neigh;
                /* One gc-and-retry pass is allowed, but not in softirq. */
                int attempts = !in_softirq();

                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
                        /*
                         * No gateway: the destination itself becomes the
                         * nexthop address.  A destination equal to the
                         * address of a non-host prefix is flagged anycast.
                         */
                        if (rt->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
                                rt->rt6i_flags |= RTF_ANYCAST;
                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
                }

                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
                rt->rt6i_dst.plen = 128;
                rt->rt6i_flags |= RTF_CACHE;
                rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
                        rt->rt6i_src.plen = 128;
                }
#endif

        retry:
                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
                if (IS_ERR(neigh)) {
                        struct net *net = dev_net(rt->rt6i_dev);
                        int saved_rt_min_interval =
                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
                        int saved_rt_elasticity =
                                net->ipv6.sysctl.ip6_rt_gc_elasticity;

                        if (attempts-- > 0) {
                                /*
                                 * Run the dst garbage collector once with
                                 * elasticity 1 and min interval 0, restore
                                 * the saved sysctl values, then try the
                                 * neighbour lookup again.
                                 */
                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);

                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
                                        saved_rt_elasticity;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
                                        saved_rt_min_interval;
                                goto retry;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ipv6: Neighbour table overflow.\n");
                        dst_free(&rt->dst);
                        return NULL;
                }
                rt->rt6i_nexthop = neigh;

        }

        return rt;
}
681
682 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
683 {
684         struct rt6_info *rt = ip6_rt_copy(ort);
685         if (rt) {
686                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
687                 rt->rt6i_dst.plen = 128;
688                 rt->rt6i_flags |= RTF_CACHE;
689                 rt->dst.flags |= DST_HOST;
690                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
691         }
692         return rt;
693 }
694
/*
 * Common per-table route resolution for the input and output paths.
 *
 * @oif is the interface index to match (the callers pass fl->iif or
 * fl->oif).  Only RT6_LOOKUP_F_IFACE is taken from @flags.  Unless
 * forwarding is enabled in devconf_all, the first pass also sets
 * RT6_LOOKUP_F_REACHABLE; whenever the "out" label is reached with that
 * flag still set, the lookup is repeated once without it.
 *
 * A selected route that is neither the null entry nor already RTF_CACHE
 * is either copied with rt6_alloc_cow() (no nexthop attached and not
 * RTF_NONEXTHOP) and inserted into the table, or, with
 * CLONE_OFFLINK_ROUTE == 0, returned as is.  If the copy fails or the
 * insertion fails, the whole lookup is retried, up to three attempts.
 *
 * The returned route has a reference held for the caller and its
 * lastuse/__use counters updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        /* May jump to restart (new fn) or to out (root reached). */
        BACKTRACK(net, &fl->fl6_src);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route before giving up the table lock. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
        else {
#if CLONE_OFFLINK_ROUTE
                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
                /* Return the table route itself; it is already held. */
                goto out2;
#endif
        }

        /* Swap the reference from the original to the copy (or null entry). */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /* Still holding the table read lock here. */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
767
768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
769                                             struct flowi *fl, int flags)
770 {
771         return ip6_pol_route(net, table, fl->iif, fl, flags);
772 }
773
774 void ip6_route_input(struct sk_buff *skb)
775 {
776         struct ipv6hdr *iph = ipv6_hdr(skb);
777         struct net *net = dev_net(skb->dev);
778         int flags = RT6_LOOKUP_F_HAS_SADDR;
779         struct flowi fl = {
780                 .iif = skb->dev->ifindex,
781                 .nl_u = {
782                         .ip6_u = {
783                                 .daddr = iph->daddr,
784                                 .saddr = iph->saddr,
785                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
786                         },
787                 },
788                 .mark = skb->mark,
789                 .proto = iph->nexthdr,
790         };
791
792         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
793                 flags |= RT6_LOOKUP_F_IFACE;
794
795         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
796 }
797
798 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
799                                              struct flowi *fl, int flags)
800 {
801         return ip6_pol_route(net, table, fl->oif, fl, flags);
802 }
803
804 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
805                                     struct flowi *fl)
806 {
807         int flags = 0;
808
809         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
810                 flags |= RT6_LOOKUP_F_IFACE;
811
812         if (!ipv6_addr_any(&fl->fl6_src))
813                 flags |= RT6_LOOKUP_F_HAS_SADDR;
814         else if (sk)
815                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
816
817         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
818 }
819
820 EXPORT_SYMBOL(ip6_route_output);
821
822 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
823 {
824         struct rt6_info *ort = (struct rt6_info *) *dstp;
825         struct rt6_info *rt = (struct rt6_info *)
826                 dst_alloc(&ip6_dst_blackhole_ops);
827         struct dst_entry *new = NULL;
828
829         if (rt) {
830                 new = &rt->dst;
831
832                 atomic_set(&new->__refcnt, 1);
833                 new->__use = 1;
834                 new->input = dst_discard;
835                 new->output = dst_discard;
836
837                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
838                 new->dev = ort->dst.dev;
839                 if (new->dev)
840                         dev_hold(new->dev);
841                 rt->rt6i_idev = ort->rt6i_idev;
842                 if (rt->rt6i_idev)
843                         in6_dev_hold(rt->rt6i_idev);
844                 rt->rt6i_expires = 0;
845
846                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
847                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
848                 rt->rt6i_metric = 0;
849
850                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
851 #ifdef CONFIG_IPV6_SUBTREES
852                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
853 #endif
854
855                 dst_free(new);
856         }
857
858         dst_release(*dstp);
859         *dstp = new;
860         return new ? 0 : -ENOMEM;
861 }
862 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
863
864 /*
865  *      Destination cache support functions
866  */
867
868 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
869 {
870         struct rt6_info *rt;
871
872         rt = (struct rt6_info *) dst;
873
874         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
875                 return dst;
876
877         return NULL;
878 }
879
880 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
881 {
882         struct rt6_info *rt = (struct rt6_info *) dst;
883
884         if (rt) {
885                 if (rt->rt6i_flags & RTF_CACHE) {
886                         if (rt6_check_expired(rt)) {
887                                 ip6_del_rt(rt);
888                                 dst = NULL;
889                         }
890                 } else {
891                         dst_release(dst);
892                         dst = NULL;
893                 }
894         }
895         return dst;
896 }
897
898 static void ip6_link_failure(struct sk_buff *skb)
899 {
900         struct rt6_info *rt;
901
902         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
903
904         rt = (struct rt6_info *) skb_dst(skb);
905         if (rt) {
906                 if (rt->rt6i_flags&RTF_CACHE) {
907                         dst_set_expires(&rt->dst, 0);
908                         rt->rt6i_flags |= RTF_EXPIRES;
909                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
910                         rt->rt6i_node->fn_sernum = -1;
911         }
912 }
913
914 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
915 {
916         struct rt6_info *rt6 = (struct rt6_info*)dst;
917
918         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
919                 rt6->rt6i_flags |= RTF_MODIFIED;
920                 if (mtu < IPV6_MIN_MTU) {
921                         mtu = IPV6_MIN_MTU;
922                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
923                 }
924                 dst->metrics[RTAX_MTU-1] = mtu;
925                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
926         }
927 }
928
929 static int ipv6_get_mtu(struct net_device *dev);
930
931 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
932 {
933         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
934
935         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
936                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
937
938         /*
939          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
940          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
941          * IPV6_MAXPLEN is also valid and means: "any MSS,
942          * rely only on pmtu discovery"
943          */
944         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
945                 mtu = IPV6_MAXPLEN;
946         return mtu;
947 }
948
949 static struct dst_entry *icmp6_dst_gc_list;
950 static DEFINE_SPINLOCK(icmp6_dst_lock);
951
952 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
953                                   struct neighbour *neigh,
954                                   const struct in6_addr *addr)
955 {
956         struct rt6_info *rt;
957         struct inet6_dev *idev = in6_dev_get(dev);
958         struct net *net = dev_net(dev);
959
960         if (unlikely(idev == NULL))
961                 return NULL;
962
963         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
964         if (unlikely(rt == NULL)) {
965                 in6_dev_put(idev);
966                 goto out;
967         }
968
969         dev_hold(dev);
970         if (neigh)
971                 neigh_hold(neigh);
972         else {
973                 neigh = ndisc_get_neigh(dev, addr);
974                 if (IS_ERR(neigh))
975                         neigh = NULL;
976         }
977
978         rt->rt6i_dev      = dev;
979         rt->rt6i_idev     = idev;
980         rt->rt6i_nexthop  = neigh;
981         atomic_set(&rt->dst.__refcnt, 1);
982         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
983         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
984         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
985         rt->dst.output  = ip6_output;
986
987 #if 0   /* there's no chance to use these for ndisc */
988         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
989                                 ? DST_HOST
990                                 : 0;
991         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
992         rt->rt6i_dst.plen = 128;
993 #endif
994
995         spin_lock_bh(&icmp6_dst_lock);
996         rt->dst.next = icmp6_dst_gc_list;
997         icmp6_dst_gc_list = &rt->dst;
998         spin_unlock_bh(&icmp6_dst_lock);
999
1000         fib6_force_start_gc(net);
1001
1002 out:
1003         return &rt->dst;
1004 }
1005
1006 int icmp6_dst_gc(void)
1007 {
1008         struct dst_entry *dst, *next, **pprev;
1009         int more = 0;
1010
1011         next = NULL;
1012
1013         spin_lock_bh(&icmp6_dst_lock);
1014         pprev = &icmp6_dst_gc_list;
1015
1016         while ((dst = *pprev) != NULL) {
1017                 if (!atomic_read(&dst->__refcnt)) {
1018                         *pprev = dst->next;
1019                         dst_free(dst);
1020                 } else {
1021                         pprev = &dst->next;
1022                         ++more;
1023                 }
1024         }
1025
1026         spin_unlock_bh(&icmp6_dst_lock);
1027
1028         return more;
1029 }
1030
1031 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1032                             void *arg)
1033 {
1034         struct dst_entry *dst, **pprev;
1035
1036         spin_lock_bh(&icmp6_dst_lock);
1037         pprev = &icmp6_dst_gc_list;
1038         while ((dst = *pprev) != NULL) {
1039                 struct rt6_info *rt = (struct rt6_info *) dst;
1040                 if (func(rt, arg)) {
1041                         *pprev = dst->next;
1042                         dst_free(dst);
1043                 } else {
1044                         pprev = &dst->next;
1045                 }
1046         }
1047         spin_unlock_bh(&icmp6_dst_lock);
1048 }
1049
1050 static int ip6_dst_gc(struct dst_ops *ops)
1051 {
1052         unsigned long now = jiffies;
1053         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1054         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1055         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1056         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1057         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1058         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1059         int entries;
1060
1061         entries = dst_entries_get_fast(ops);
1062         if (time_after(rt_last_gc + rt_min_interval, now) &&
1063             entries <= rt_max_size)
1064                 goto out;
1065
1066         net->ipv6.ip6_rt_gc_expire++;
1067         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068         net->ipv6.ip6_rt_last_gc = now;
1069         entries = dst_entries_get_slow(ops);
1070         if (entries < ops->gc_thresh)
1071                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1072 out:
1073         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1074         return entries > rt_max_size;
1075 }
1076
1077 /* Clean host part of a prefix. Not necessary in radix tree,
1078    but results in cleaner routing tables.
1079
1080    Remove it only when all the things will work!
1081  */
1082
1083 static int ipv6_get_mtu(struct net_device *dev)
1084 {
1085         int mtu = IPV6_MIN_MTU;
1086         struct inet6_dev *idev;
1087
1088         rcu_read_lock();
1089         idev = __in6_dev_get(dev);
1090         if (idev)
1091                 mtu = idev->cnf.mtu6;
1092         rcu_read_unlock();
1093         return mtu;
1094 }
1095
1096 int ip6_dst_hoplimit(struct dst_entry *dst)
1097 {
1098         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1099         if (hoplimit < 0) {
1100                 struct net_device *dev = dst->dev;
1101                 struct inet6_dev *idev;
1102
1103                 rcu_read_lock();
1104                 idev = __in6_dev_get(dev);
1105                 if (idev)
1106                         hoplimit = idev->cnf.hop_limit;
1107                 else
1108                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1109                 rcu_read_unlock();
1110         }
1111         return hoplimit;
1112 }
1113
1114 /*
1115  *
1116  */
1117
1118 int ip6_route_add(struct fib6_config *cfg)
1119 {
1120         int err;
1121         struct net *net = cfg->fc_nlinfo.nl_net;
1122         struct rt6_info *rt = NULL;
1123         struct net_device *dev = NULL;
1124         struct inet6_dev *idev = NULL;
1125         struct fib6_table *table;
1126         int addr_type;
1127
1128         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1129                 return -EINVAL;
1130 #ifndef CONFIG_IPV6_SUBTREES
1131         if (cfg->fc_src_len)
1132                 return -EINVAL;
1133 #endif
1134         if (cfg->fc_ifindex) {
1135                 err = -ENODEV;
1136                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1137                 if (!dev)
1138                         goto out;
1139                 idev = in6_dev_get(dev);
1140                 if (!idev)
1141                         goto out;
1142         }
1143
1144         if (cfg->fc_metric == 0)
1145                 cfg->fc_metric = IP6_RT_PRIO_USER;
1146
1147         table = fib6_new_table(net, cfg->fc_table);
1148         if (table == NULL) {
1149                 err = -ENOBUFS;
1150                 goto out;
1151         }
1152
1153         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1154
1155         if (rt == NULL) {
1156                 err = -ENOMEM;
1157                 goto out;
1158         }
1159
1160         rt->dst.obsolete = -1;
1161         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1162                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1163                                 0;
1164
1165         if (cfg->fc_protocol == RTPROT_UNSPEC)
1166                 cfg->fc_protocol = RTPROT_BOOT;
1167         rt->rt6i_protocol = cfg->fc_protocol;
1168
1169         addr_type = ipv6_addr_type(&cfg->fc_dst);
1170
1171         if (addr_type & IPV6_ADDR_MULTICAST)
1172                 rt->dst.input = ip6_mc_input;
1173         else if (cfg->fc_flags & RTF_LOCAL)
1174                 rt->dst.input = ip6_input;
1175         else
1176                 rt->dst.input = ip6_forward;
1177
1178         rt->dst.output = ip6_output;
1179
1180         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1181         rt->rt6i_dst.plen = cfg->fc_dst_len;
1182         if (rt->rt6i_dst.plen == 128)
1183                rt->dst.flags = DST_HOST;
1184
1185 #ifdef CONFIG_IPV6_SUBTREES
1186         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1187         rt->rt6i_src.plen = cfg->fc_src_len;
1188 #endif
1189
1190         rt->rt6i_metric = cfg->fc_metric;
1191
1192         /* We cannot add true routes via loopback here,
1193            they would result in kernel looping; promote them to reject routes
1194          */
1195         if ((cfg->fc_flags & RTF_REJECT) ||
1196             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1197                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1198                 /* hold loopback dev/idev if we haven't done so. */
1199                 if (dev != net->loopback_dev) {
1200                         if (dev) {
1201                                 dev_put(dev);
1202                                 in6_dev_put(idev);
1203                         }
1204                         dev = net->loopback_dev;
1205                         dev_hold(dev);
1206                         idev = in6_dev_get(dev);
1207                         if (!idev) {
1208                                 err = -ENODEV;
1209                                 goto out;
1210                         }
1211                 }
1212                 rt->dst.output = ip6_pkt_discard_out;
1213                 rt->dst.input = ip6_pkt_discard;
1214                 rt->dst.error = -ENETUNREACH;
1215                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1216                 goto install_route;
1217         }
1218
1219         if (cfg->fc_flags & RTF_GATEWAY) {
1220                 struct in6_addr *gw_addr;
1221                 int gwa_type;
1222
1223                 gw_addr = &cfg->fc_gateway;
1224                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1225                 gwa_type = ipv6_addr_type(gw_addr);
1226
1227                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1228                         struct rt6_info *grt;
1229
1230                         /* IPv6 strictly inhibits using not link-local
1231                            addresses as nexthop address.
1232                            Otherwise, router will not able to send redirects.
1233                            It is very good, but in some (rare!) circumstances
1234                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1235                            some exceptions. --ANK
1236                          */
1237                         err = -EINVAL;
1238                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1239                                 goto out;
1240
1241                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1242
1243                         err = -EHOSTUNREACH;
1244                         if (grt == NULL)
1245                                 goto out;
1246                         if (dev) {
1247                                 if (dev != grt->rt6i_dev) {
1248                                         dst_release(&grt->dst);
1249                                         goto out;
1250                                 }
1251                         } else {
1252                                 dev = grt->rt6i_dev;
1253                                 idev = grt->rt6i_idev;
1254                                 dev_hold(dev);
1255                                 in6_dev_hold(grt->rt6i_idev);
1256                         }
1257                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1258                                 err = 0;
1259                         dst_release(&grt->dst);
1260
1261                         if (err)
1262                                 goto out;
1263                 }
1264                 err = -EINVAL;
1265                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1266                         goto out;
1267         }
1268
1269         err = -ENODEV;
1270         if (dev == NULL)
1271                 goto out;
1272
1273         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1274                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1275                 if (IS_ERR(rt->rt6i_nexthop)) {
1276                         err = PTR_ERR(rt->rt6i_nexthop);
1277                         rt->rt6i_nexthop = NULL;
1278                         goto out;
1279                 }
1280         }
1281
1282         rt->rt6i_flags = cfg->fc_flags;
1283
1284 install_route:
1285         if (cfg->fc_mx) {
1286                 struct nlattr *nla;
1287                 int remaining;
1288
1289                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1290                         int type = nla_type(nla);
1291
1292                         if (type) {
1293                                 if (type > RTAX_MAX) {
1294                                         err = -EINVAL;
1295                                         goto out;
1296                                 }
1297
1298                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1299                         }
1300                 }
1301         }
1302
1303         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1304                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1305         if (!dst_mtu(&rt->dst))
1306                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1307         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1308                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1309         rt->dst.dev = dev;
1310         rt->rt6i_idev = idev;
1311         rt->rt6i_table = table;
1312
1313         cfg->fc_nlinfo.nl_net = dev_net(dev);
1314
1315         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1316
1317 out:
1318         if (dev)
1319                 dev_put(dev);
1320         if (idev)
1321                 in6_dev_put(idev);
1322         if (rt)
1323                 dst_free(&rt->dst);
1324         return err;
1325 }
1326
1327 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1328 {
1329         int err;
1330         struct fib6_table *table;
1331         struct net *net = dev_net(rt->rt6i_dev);
1332
1333         if (rt == net->ipv6.ip6_null_entry)
1334                 return -ENOENT;
1335
1336         table = rt->rt6i_table;
1337         write_lock_bh(&table->tb6_lock);
1338
1339         err = fib6_del(rt, info);
1340         dst_release(&rt->dst);
1341
1342         write_unlock_bh(&table->tb6_lock);
1343
1344         return err;
1345 }
1346
1347 int ip6_del_rt(struct rt6_info *rt)
1348 {
1349         struct nl_info info = {
1350                 .nl_net = dev_net(rt->rt6i_dev),
1351         };
1352         return __ip6_del_rt(rt, &info);
1353 }
1354
1355 static int ip6_route_del(struct fib6_config *cfg)
1356 {
1357         struct fib6_table *table;
1358         struct fib6_node *fn;
1359         struct rt6_info *rt;
1360         int err = -ESRCH;
1361
1362         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1363         if (table == NULL)
1364                 return err;
1365
1366         read_lock_bh(&table->tb6_lock);
1367
1368         fn = fib6_locate(&table->tb6_root,
1369                          &cfg->fc_dst, cfg->fc_dst_len,
1370                          &cfg->fc_src, cfg->fc_src_len);
1371
1372         if (fn) {
1373                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1374                         if (cfg->fc_ifindex &&
1375                             (rt->rt6i_dev == NULL ||
1376                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1377                                 continue;
1378                         if (cfg->fc_flags & RTF_GATEWAY &&
1379                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1380                                 continue;
1381                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1382                                 continue;
1383                         dst_hold(&rt->dst);
1384                         read_unlock_bh(&table->tb6_lock);
1385
1386                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1387                 }
1388         }
1389         read_unlock_bh(&table->tb6_lock);
1390
1391         return err;
1392 }
1393
1394 /*
1395  *      Handle redirects
1396  */
1397 struct ip6rd_flowi {
1398         struct flowi fl;
1399         struct in6_addr gateway;
1400 };
1401
1402 static struct rt6_info *__ip6_route_redirect(struct net *net,
1403                                              struct fib6_table *table,
1404                                              struct flowi *fl,
1405                                              int flags)
1406 {
1407         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1408         struct rt6_info *rt;
1409         struct fib6_node *fn;
1410
1411         /*
1412          * Get the "current" route for this destination and
1413          * check if the redirect has come from approriate router.
1414          *
1415          * RFC 2461 specifies that redirects should only be
1416          * accepted if they come from the nexthop to the target.
1417          * Due to the way the routes are chosen, this notion
1418          * is a bit fuzzy and one might need to check all possible
1419          * routes.
1420          */
1421
1422         read_lock_bh(&table->tb6_lock);
1423         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1424 restart:
1425         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1426                 /*
1427                  * Current route is on-link; redirect is always invalid.
1428                  *
1429                  * Seems, previous statement is not true. It could
1430                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1431                  * But then router serving it might decide, that we should
1432                  * know truth 8)8) --ANK (980726).
1433                  */
1434                 if (rt6_check_expired(rt))
1435                         continue;
1436                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1437                         continue;
1438                 if (fl->oif != rt->rt6i_dev->ifindex)
1439                         continue;
1440                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1441                         continue;
1442                 break;
1443         }
1444
1445         if (!rt)
1446                 rt = net->ipv6.ip6_null_entry;
1447         BACKTRACK(net, &fl->fl6_src);
1448 out:
1449         dst_hold(&rt->dst);
1450
1451         read_unlock_bh(&table->tb6_lock);
1452
1453         return rt;
1454 };
1455
1456 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1457                                            struct in6_addr *src,
1458                                            struct in6_addr *gateway,
1459                                            struct net_device *dev)
1460 {
1461         int flags = RT6_LOOKUP_F_HAS_SADDR;
1462         struct net *net = dev_net(dev);
1463         struct ip6rd_flowi rdfl = {
1464                 .fl = {
1465                         .oif = dev->ifindex,
1466                         .nl_u = {
1467                                 .ip6_u = {
1468                                         .daddr = *dest,
1469                                         .saddr = *src,
1470                                 },
1471                         },
1472                 },
1473         };
1474
1475         ipv6_addr_copy(&rdfl.gateway, gateway);
1476
1477         if (rt6_need_strict(dest))
1478                 flags |= RT6_LOOKUP_F_IFACE;
1479
1480         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481                                                    flags, __ip6_route_redirect);
1482 }
1483
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485                   struct in6_addr *saddr,
1486                   struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488         struct rt6_info *rt, *nrt = NULL;
1489         struct netevent_redirect netevent;
1490         struct net *net = dev_net(neigh->dev);
1491
1492         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493
1494         if (rt == net->ipv6.ip6_null_entry) {
1495                 if (net_ratelimit())
1496                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497                                "for redirect target\n");
1498                 goto out;
1499         }
1500
1501         /*
1502          *      We have finally decided to accept it.
1503          */
1504
1505         neigh_update(neigh, lladdr, NUD_STALE,
1506                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507                      NEIGH_UPDATE_F_OVERRIDE|
1508                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509                                      NEIGH_UPDATE_F_ISROUTER))
1510                      );
1511
1512         /*
1513          * Redirect received -> path was valid.
1514          * Look, redirects are sent only in response to data packets,
1515          * so that this nexthop apparently is reachable. --ANK
1516          */
1517         dst_confirm(&rt->dst);
1518
1519         /* Duplicate redirect: silently ignore. */
1520         if (neigh == rt->dst.neighbour)
1521                 goto out;
1522
1523         nrt = ip6_rt_copy(rt);
1524         if (nrt == NULL)
1525                 goto out;
1526
1527         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528         if (on_link)
1529                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1530
1531         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532         nrt->rt6i_dst.plen = 128;
1533         nrt->dst.flags |= DST_HOST;
1534
1535         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536         nrt->rt6i_nexthop = neigh_clone(neigh);
1537         /* Reset pmtu, it may be better */
1538         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1539         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1540                                                         dst_mtu(&nrt->dst));
1541
1542         if (ip6_ins_rt(nrt))
1543                 goto out;
1544
1545         netevent.old = &rt->dst;
1546         netevent.new = &nrt->dst;
1547         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548
1549         if (rt->rt6i_flags&RTF_CACHE) {
1550                 ip6_del_rt(rt);
1551                 return;
1552         }
1553
1554 out:
1555         dst_release(&rt->dst);
1556 }
1557
1558 /*
1559  *      Handle ICMP "packet too big" messages
1560  *      i.e. Path MTU discovery
1561  */
1562
1563 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1564                              struct net *net, u32 pmtu, int ifindex)
1565 {
1566         struct rt6_info *rt, *nrt;
1567         int allfrag = 0;
1568 again:
1569         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1570         if (rt == NULL)
1571                 return;
1572
1573         if (rt6_check_expired(rt)) {
1574                 ip6_del_rt(rt);
1575                 goto again;
1576         }
1577
1578         if (pmtu >= dst_mtu(&rt->dst))
1579                 goto out;
1580
1581         if (pmtu < IPV6_MIN_MTU) {
1582                 /*
1583                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1584                  * MTU (1280) and a fragment header should always be included
1585                  * after a node receiving Too Big message reporting PMTU is
1586                  * less than the IPv6 Minimum Link MTU.
1587                  */
1588                 pmtu = IPV6_MIN_MTU;
1589                 allfrag = 1;
1590         }
1591
1592         /* New mtu received -> path was valid.
1593            They are sent only in response to data packets,
1594            so that this nexthop apparently is reachable. --ANK
1595          */
1596         dst_confirm(&rt->dst);
1597
1598         /* Host route. If it is static, it would be better
1599            not to override it, but add new one, so that
1600            when cache entry will expire old pmtu
1601            would return automatically.
1602          */
1603         if (rt->rt6i_flags & RTF_CACHE) {
1604                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1605                 if (allfrag)
1606                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1607                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1608                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1609                 goto out;
1610         }
1611
1612         /* Network route.
1613            Two cases are possible:
1614            1. It is connected route. Action: COW
1615            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1616          */
1617         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1618                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1619         else
1620                 nrt = rt6_alloc_clone(rt, daddr);
1621
1622         if (nrt) {
1623                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1624                 if (allfrag)
1625                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1626
1627                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1628                  * happened within 5 mins, the recommended timer is 10 mins.
1629                  * Here this route expiration time is set to ip6_rt_mtu_expires
1630                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1631                  * and detecting PMTU increase will be automatically happened.
1632                  */
1633                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1634                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1635
1636                 ip6_ins_rt(nrt);
1637         }
1638 out:
1639         dst_release(&rt->dst);
1640 }
1641
1642 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1643                         struct net_device *dev, u32 pmtu)
1644 {
1645         struct net *net = dev_net(dev);
1646
1647         /*
1648          * RFC 1981 states that a node "MUST reduce the size of the packets it
1649          * is sending along the path" that caused the Packet Too Big message.
1650          * Since it's not possible in the general case to determine which
1651          * interface was used to send the original packet, we update the MTU
1652          * on the interface that will be used to send future packets. We also
1653          * update the MTU on the interface that received the Packet Too Big in
1654          * case the original packet was forced out that interface with
1655          * SO_BINDTODEVICE or similar. This is the next best thing to the
1656          * correct behaviour, which would be to update the MTU on all
1657          * interfaces.
1658          */
1659         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1660         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1661 }
1662
1663 /*
1664  *      Misc support functions
1665  */
1666
1667 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1668 {
1669         struct net *net = dev_net(ort->rt6i_dev);
1670         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1671
1672         if (rt) {
1673                 rt->dst.input = ort->dst.input;
1674                 rt->dst.output = ort->dst.output;
1675
1676                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1677                 rt->dst.error = ort->dst.error;
1678                 rt->dst.dev = ort->dst.dev;
1679                 if (rt->dst.dev)
1680                         dev_hold(rt->dst.dev);
1681                 rt->rt6i_idev = ort->rt6i_idev;
1682                 if (rt->rt6i_idev)
1683                         in6_dev_hold(rt->rt6i_idev);
1684                 rt->dst.lastuse = jiffies;
1685                 rt->rt6i_expires = 0;
1686
1687                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1688                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1689                 rt->rt6i_metric = 0;
1690
1691                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1692 #ifdef CONFIG_IPV6_SUBTREES
1693                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1694 #endif
1695                 rt->rt6i_table = ort->rt6i_table;
1696         }
1697         return rt;
1698 }
1699
1700 #ifdef CONFIG_IPV6_ROUTE_INFO
1701 static struct rt6_info *rt6_get_route_info(struct net *net,
1702                                            struct in6_addr *prefix, int prefixlen,
1703                                            struct in6_addr *gwaddr, int ifindex)
1704 {
1705         struct fib6_node *fn;
1706         struct rt6_info *rt = NULL;
1707         struct fib6_table *table;
1708
1709         table = fib6_get_table(net, RT6_TABLE_INFO);
1710         if (table == NULL)
1711                 return NULL;
1712
1713         write_lock_bh(&table->tb6_lock);
1714         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1715         if (!fn)
1716                 goto out;
1717
1718         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1719                 if (rt->rt6i_dev->ifindex != ifindex)
1720                         continue;
1721                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1722                         continue;
1723                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1724                         continue;
1725                 dst_hold(&rt->dst);
1726                 break;
1727         }
1728 out:
1729         write_unlock_bh(&table->tb6_lock);
1730         return rt;
1731 }
1732
1733 static struct rt6_info *rt6_add_route_info(struct net *net,
1734                                            struct in6_addr *prefix, int prefixlen,
1735                                            struct in6_addr *gwaddr, int ifindex,
1736                                            unsigned pref)
1737 {
1738         struct fib6_config cfg = {
1739                 .fc_table       = RT6_TABLE_INFO,
1740                 .fc_metric      = IP6_RT_PRIO_USER,
1741                 .fc_ifindex     = ifindex,
1742                 .fc_dst_len     = prefixlen,
1743                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1744                                   RTF_UP | RTF_PREF(pref),
1745                 .fc_nlinfo.pid = 0,
1746                 .fc_nlinfo.nlh = NULL,
1747                 .fc_nlinfo.nl_net = net,
1748         };
1749
1750         ipv6_addr_copy(&cfg.fc_dst, prefix);
1751         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1752
1753         /* We should treat it as a default route if prefix length is 0. */
1754         if (!prefixlen)
1755                 cfg.fc_flags |= RTF_DEFAULT;
1756
1757         ip6_route_add(&cfg);
1758
1759         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1760 }
1761 #endif
1762
1763 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1764 {
1765         struct rt6_info *rt;
1766         struct fib6_table *table;
1767
1768         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1769         if (table == NULL)
1770                 return NULL;
1771
1772         write_lock_bh(&table->tb6_lock);
1773         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1774                 if (dev == rt->rt6i_dev &&
1775                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1776                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1777                         break;
1778         }
1779         if (rt)
1780                 dst_hold(&rt->dst);
1781         write_unlock_bh(&table->tb6_lock);
1782         return rt;
1783 }
1784
1785 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1786                                      struct net_device *dev,
1787                                      unsigned int pref)
1788 {
1789         struct fib6_config cfg = {
1790                 .fc_table       = RT6_TABLE_DFLT,
1791                 .fc_metric      = IP6_RT_PRIO_USER,
1792                 .fc_ifindex     = dev->ifindex,
1793                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1794                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1795                 .fc_nlinfo.pid = 0,
1796                 .fc_nlinfo.nlh = NULL,
1797                 .fc_nlinfo.nl_net = dev_net(dev),
1798         };
1799
1800         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1801
1802         ip6_route_add(&cfg);
1803
1804         return rt6_get_dflt_router(gwaddr, dev);
1805 }
1806
1807 void rt6_purge_dflt_routers(struct net *net)
1808 {
1809         struct rt6_info *rt;
1810         struct fib6_table *table;
1811
1812         /* NOTE: Keep consistent with rt6_get_dflt_router */
1813         table = fib6_get_table(net, RT6_TABLE_DFLT);
1814         if (table == NULL)
1815                 return;
1816
1817 restart:
1818         read_lock_bh(&table->tb6_lock);
1819         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1820                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1821                         dst_hold(&rt->dst);
1822                         read_unlock_bh(&table->tb6_lock);
1823                         ip6_del_rt(rt);
1824                         goto restart;
1825                 }
1826         }
1827         read_unlock_bh(&table->tb6_lock);
1828 }
1829
1830 static void rtmsg_to_fib6_config(struct net *net,
1831                                  struct in6_rtmsg *rtmsg,
1832                                  struct fib6_config *cfg)
1833 {
1834         memset(cfg, 0, sizeof(*cfg));
1835
1836         cfg->fc_table = RT6_TABLE_MAIN;
1837         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1838         cfg->fc_metric = rtmsg->rtmsg_metric;
1839         cfg->fc_expires = rtmsg->rtmsg_info;
1840         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1841         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1842         cfg->fc_flags = rtmsg->rtmsg_flags;
1843
1844         cfg->fc_nlinfo.nl_net = net;
1845
1846         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1847         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1848         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1849 }
1850
1851 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1852 {
1853         struct fib6_config cfg;
1854         struct in6_rtmsg rtmsg;
1855         int err;
1856
1857         switch(cmd) {
1858         case SIOCADDRT:         /* Add a route */
1859         case SIOCDELRT:         /* Delete a route */
1860                 if (!capable(CAP_NET_ADMIN))
1861                         return -EPERM;
1862                 err = copy_from_user(&rtmsg, arg,
1863                                      sizeof(struct in6_rtmsg));
1864                 if (err)
1865                         return -EFAULT;
1866
1867                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1868
1869                 rtnl_lock();
1870                 switch (cmd) {
1871                 case SIOCADDRT:
1872                         err = ip6_route_add(&cfg);
1873                         break;
1874                 case SIOCDELRT:
1875                         err = ip6_route_del(&cfg);
1876                         break;
1877                 default:
1878                         err = -EINVAL;
1879                 }
1880                 rtnl_unlock();
1881
1882                 return err;
1883         }
1884
1885         return -EINVAL;
1886 }
1887
1888 /*
1889  *      Drop the packet on the floor
1890  */
1891
1892 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1893 {
1894         int type;
1895         struct dst_entry *dst = skb_dst(skb);
1896         switch (ipstats_mib_noroutes) {
1897         case IPSTATS_MIB_INNOROUTES:
1898                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1899                 if (type == IPV6_ADDR_ANY) {
1900                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1901                                       IPSTATS_MIB_INADDRERRORS);
1902                         break;
1903                 }
1904                 /* FALLTHROUGH */
1905         case IPSTATS_MIB_OUTNOROUTES:
1906                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1907                               ipstats_mib_noroutes);
1908                 break;
1909         }
1910         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1911         kfree_skb(skb);
1912         return 0;
1913 }
1914
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE and
 * IPSTATS_MIB_INNOROUTES as the counter selector.  Returns its result.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
1919
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE and
 * IPSTATS_MIB_OUTNOROUTES as the counter selector.  Returns its result.
 */
static int ip6_pkt_discard_out(struct sk_buff *skb)
{
        /* Point skb->dev at the device of the attached dst before dropping. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
1925
1926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1927
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED
 * and IPSTATS_MIB_INNOROUTES as the counter selector.  Returns its result.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
1932
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED
 * and IPSTATS_MIB_OUTNOROUTES as the counter selector.  Returns its result.
 */
static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
        /* Point skb->dev at the device of the attached dst before dropping. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
1938
1939 #endif
1940
1941 /*
1942  *      Allocate a dst for local (unicast / anycast) address.
1943  */
1944
1945 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1946                                     const struct in6_addr *addr,
1947                                     int anycast)
1948 {
1949         struct net *net = dev_net(idev->dev);
1950         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1951         struct neighbour *neigh;
1952
1953         if (rt == NULL) {
1954                 if (net_ratelimit())
1955                         pr_warning("IPv6:  Maximum number of routes reached,"
1956                                    " consider increasing route/max_size.\n");
1957                 return ERR_PTR(-ENOMEM);
1958         }
1959
1960         dev_hold(net->loopback_dev);
1961         in6_dev_hold(idev);
1962
1963         rt->dst.flags = DST_HOST;
1964         rt->dst.input = ip6_input;
1965         rt->dst.output = ip6_output;
1966         rt->rt6i_dev = net->loopback_dev;
1967         rt->rt6i_idev = idev;
1968         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1969         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1970         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1971         rt->dst.obsolete = -1;
1972
1973         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1974         if (anycast)
1975                 rt->rt6i_flags |= RTF_ANYCAST;
1976         else
1977                 rt->rt6i_flags |= RTF_LOCAL;
1978         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1979         if (IS_ERR(neigh)) {
1980                 dst_free(&rt->dst);
1981
1982                 /* We are casting this because that is the return
1983                  * value type.  But an errno encoded pointer is the
1984                  * same regardless of the underlying pointer type,
1985                  * and that's what we are returning.  So this is OK.
1986                  */
1987                 return (struct rt6_info *) neigh;
1988         }
1989         rt->rt6i_nexthop = neigh;
1990
1991         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1992         rt->rt6i_dst.plen = 128;
1993         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1994
1995         atomic_set(&rt->dst.__refcnt, 1);
1996
1997         return rt;
1998 }
1999
/*
 * Argument bundle handed to fib6_ifdown() by rt6_ifdown().
 */
struct arg_dev_net {
        struct net_device *dev; /* device to match; NULL matches any device */
        struct net *net;        /* namespace whose ip6_null_entry is spared */
};
2004
2005 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2006 {
2007         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
2008         struct net *net = ((struct arg_dev_net *)arg)->net;
2009
2010         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2011             rt != net->ipv6.ip6_null_entry) {
2012                 RT6_TRACE("deleted by ifdown %p\n", rt);
2013                 return -1;
2014         }
2015         return 0;
2016 }
2017
/*
 * Run fib6_ifdown() for @dev over all FIB entries of @net (via
 * fib6_clean_all()) and over the entries visited by icmp6_clean_all().
 * A NULL @dev makes fib6_ifdown() match routes on any device.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
        struct arg_dev_net adn = {
                .dev = dev,
                .net = net,
        };

        fib6_clean_all(net, fib6_ifdown, 0, &adn);
        icmp6_clean_all(fib6_ifdown, &adn);
}
2028
/*
 * Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change().
 */
struct rt6_mtu_change_arg
{
        struct net_device *dev; /* device whose MTU changed */
        unsigned mtu;           /* the new MTU value */
};
2034
2035 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2036 {
2037         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2038         struct inet6_dev *idev;
2039         struct net *net = dev_net(arg->dev);
2040
2041         /* In IPv6 pmtu discovery is not optional,
2042            so that RTAX_MTU lock cannot disable it.
2043            We still use this lock to block changes
2044            caused by addrconf/ndisc.
2045         */
2046
2047         idev = __in6_dev_get(arg->dev);
2048         if (idev == NULL)
2049                 return 0;
2050
2051         /* For administrative MTU increase, there is no way to discover
2052            IPv6 PMTU increase, so PMTU increase should be updated here.
2053            Since RFC 1981 doesn't include administrative MTU increase
2054            update PMTU increase is a MUST. (i.e. jumbo frame)
2055          */
2056         /*
2057            If new MTU is less than route PMTU, this new MTU will be the
2058            lowest MTU in the path, update the route PMTU to reflect PMTU
2059            decreases; if new MTU is greater than route PMTU, and the
2060            old MTU is the lowest MTU in the path, update the route PMTU
2061            to reflect the increase. In this case if the other nodes' MTU
2062            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2063            PMTU discouvery.
2064          */
2065         if (rt->rt6i_dev == arg->dev &&
2066             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2067             (dst_mtu(&rt->dst) >= arg->mtu ||
2068              (dst_mtu(&rt->dst) < arg->mtu &&
2069               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2070                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2071                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2072         }
2073         return 0;
2074 }
2075
/*
 * Run rt6_mtu_change_route() with the new @mtu of @dev over all FIB
 * entries of the device's namespace.
 */
void rt6_mtu_change(struct net_device *dev, unsigned mtu)
{
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
}
2085
2086 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2087         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2088         [RTA_OIF]               = { .type = NLA_U32 },
2089         [RTA_IIF]               = { .type = NLA_U32 },
2090         [RTA_PRIORITY]          = { .type = NLA_U32 },
2091         [RTA_METRICS]           = { .type = NLA_NESTED },
2092 };
2093
2094 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2095                               struct fib6_config *cfg)
2096 {
2097         struct rtmsg *rtm;
2098         struct nlattr *tb[RTA_MAX+1];
2099         int err;
2100
2101         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2102         if (err < 0)
2103                 goto errout;
2104
2105         err = -EINVAL;
2106         rtm = nlmsg_data(nlh);
2107         memset(cfg, 0, sizeof(*cfg));
2108
2109         cfg->fc_table = rtm->rtm_table;
2110         cfg->fc_dst_len = rtm->rtm_dst_len;
2111         cfg->fc_src_len = rtm->rtm_src_len;
2112         cfg->fc_flags = RTF_UP;
2113         cfg->fc_protocol = rtm->rtm_protocol;
2114
2115         if (rtm->rtm_type == RTN_UNREACHABLE)
2116                 cfg->fc_flags |= RTF_REJECT;
2117
2118         if (rtm->rtm_type == RTN_LOCAL)
2119                 cfg->fc_flags |= RTF_LOCAL;
2120
2121         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2122         cfg->fc_nlinfo.nlh = nlh;
2123         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2124
2125         if (tb[RTA_GATEWAY]) {
2126                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2127                 cfg->fc_flags |= RTF_GATEWAY;
2128         }
2129
2130         if (tb[RTA_DST]) {
2131                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2132
2133                 if (nla_len(tb[RTA_DST]) < plen)
2134                         goto errout;
2135
2136                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2137         }
2138
2139         if (tb[RTA_SRC]) {
2140                 int plen = (rtm->rtm_src_len + 7) >> 3;
2141
2142                 if (nla_len(tb[RTA_SRC]) < plen)
2143                         goto errout;
2144
2145                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2146         }
2147
2148         if (tb[RTA_OIF])
2149                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2150
2151         if (tb[RTA_PRIORITY])
2152                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2153
2154         if (tb[RTA_METRICS]) {
2155                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2156                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2157         }
2158
2159         if (tb[RTA_TABLE])
2160                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2161
2162         err = 0;
2163 errout:
2164         return err;
2165 }
2166
2167 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2168 {
2169         struct fib6_config cfg;
2170         int err;
2171
2172         err = rtm_to_fib6_config(skb, nlh, &cfg);
2173         if (err < 0)
2174                 return err;
2175
2176         return ip6_route_del(&cfg);
2177 }
2178
2179 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2180 {
2181         struct fib6_config cfg;
2182         int err;
2183
2184         err = rtm_to_fib6_config(skb, nlh, &cfg);
2185         if (err < 0)
2186                 return err;
2187
2188         return ip6_route_add(&cfg);
2189 }
2190
2191 static inline size_t rt6_nlmsg_size(void)
2192 {
2193         return NLMSG_ALIGN(sizeof(struct rtmsg))
2194                + nla_total_size(16) /* RTA_SRC */
2195                + nla_total_size(16) /* RTA_DST */
2196                + nla_total_size(16) /* RTA_GATEWAY */
2197                + nla_total_size(16) /* RTA_PREFSRC */
2198                + nla_total_size(4) /* RTA_TABLE */
2199                + nla_total_size(4) /* RTA_IIF */
2200                + nla_total_size(4) /* RTA_OIF */
2201                + nla_total_size(4) /* RTA_PRIORITY */
2202                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2203                + nla_total_size(sizeof(struct rta_cacheinfo));
2204 }
2205
/*
 * Append one route message describing @rt to @skb.
 *
 * @dst/@src:  when non-NULL, reported as RTA_DST/RTA_SRC with a prefix
 *             length of 128 instead of the route's own prefix
 *             (RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif:       when non-zero, reported as RTA_IIF (or, for a multicast
 *             destination with CONFIG_IPV6_MROUTE, resolved through
 *             ip6mr_get_route()).
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the header.
 * @prefix:    when non-zero, routes without RTF_PREFIX_RT are skipped.
 * @nowait:    passed to ip6mr_get_route(); also changes how its result
 *             is handled.
 *
 * Returns 1 when the route was skipped by the @prefix filter, 0 when
 * ip6mr_get_route() returned 0 in the !@nowait case, -EMSGSIZE when the
 * header or an attribute could not be added (the partial message is
 * cancelled), and otherwise the result of nlmsg_end().
 *
 * NOTE(review): the NLA_PUT*() macros presumably jump to the
 * nla_put_failure label when the skb has no room -- confirm against
 * their definition.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
{
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        /* Fixed header. */
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id goes both into the header and into RTA_TABLE. */
        rtm->rtm_table = table;
        NLA_PUT_U32(skb, RTA_TABLE, table);
        /* Route type: reject, then local flag, then loopback device. */
        if (rt->rt6i_flags&RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_flags&RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* The stored protocol is overridden by the first matching flag. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags&RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags&RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags&RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* A caller-supplied address is reported as a /128. */
        if (dst) {
                NLA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                NLA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        /* 0 is returned as is; any
                                         * negative value cancels the
                                         * message.
                                         */
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        /* With nowait only -EMSGSIZE
                                         * cancels the message.
                                         */
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, iif);
        } else if (dst) {
                /* No input interface: report the source address that
                 * ipv6_dev_get_saddr() picks for @dst, if it finds one.
                 */
                struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
                struct in6_addr saddr_buf;
                if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
                                       dst, 0, &saddr_buf) == 0)
                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
                goto nla_put_failure;

        /* The gateway is taken from the neighbour entry's key. */
        if (rt->dst.neighbour)
                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);

        if (rt->dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

        /* Remaining lifetime in jiffies: 0 without RTF_EXPIRES, capped
         * at INT_MAX otherwise.
         */
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                expires = 0;
        else if (rt->rt6i_expires - jiffies < INT_MAX)
                expires = rt->rt6i_expires - jiffies;
        else
                expires = INT_MAX;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
                               expires, rt->dst.error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Discard the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2325
2326 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2327 {
2328         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2329         int prefix;
2330
2331         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2332                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2333                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2334         } else
2335                 prefix = 0;
2336
2337         return rt6_fill_node(arg->net,
2338                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2339                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2340                      prefix, 0, NLM_F_MULTI);
2341 }
2342
2343 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2344 {
2345         struct net *net = sock_net(in_skb->sk);
2346         struct nlattr *tb[RTA_MAX+1];
2347         struct rt6_info *rt;
2348         struct sk_buff *skb;
2349         struct rtmsg *rtm;
2350         struct flowi fl;
2351         int err, iif = 0;
2352
2353         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2354         if (err < 0)
2355                 goto errout;
2356
2357         err = -EINVAL;
2358         memset(&fl, 0, sizeof(fl));
2359
2360         if (tb[RTA_SRC]) {
2361                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2362                         goto errout;
2363
2364                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2365         }
2366
2367         if (tb[RTA_DST]) {
2368                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2369                         goto errout;
2370
2371                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2372         }
2373
2374         if (tb[RTA_IIF])
2375                 iif = nla_get_u32(tb[RTA_IIF]);
2376
2377         if (tb[RTA_OIF])
2378                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2379
2380         if (iif) {
2381                 struct net_device *dev;
2382                 dev = __dev_get_by_index(net, iif);
2383                 if (!dev) {
2384                         err = -ENODEV;
2385                         goto errout;
2386                 }
2387         }
2388
2389         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2390         if (skb == NULL) {
2391                 err = -ENOBUFS;
2392                 goto errout;
2393         }
2394
2395         /* Reserve room for dummy headers, this skb can pass
2396            through good chunk of routing engine.
2397          */
2398         skb_reset_mac_header(skb);
2399         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2400
2401         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2402         skb_dst_set(skb, &rt->dst);
2403
2404         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2405                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2406                             nlh->nlmsg_seq, 0, 0, 0);
2407         if (err < 0) {
2408                 kfree_skb(skb);
2409                 goto errout;
2410         }
2411
2412         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2413 errout:
2414         return err;
2415 }
2416
2417 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2418 {
2419         struct sk_buff *skb;
2420         struct net *net = info->nl_net;
2421         u32 seq;
2422         int err;
2423
2424         err = -ENOBUFS;
2425         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2426
2427         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2428         if (skb == NULL)
2429                 goto errout;
2430
2431         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2432                                 event, info->pid, seq, 0, 0, 0);
2433         if (err < 0) {
2434                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2435                 WARN_ON(err == -EMSGSIZE);
2436                 kfree_skb(skb);
2437                 goto errout;
2438         }
2439         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2440                     info->nlh, gfp_any());
2441         return;
2442 errout:
2443         if (err < 0)
2444                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2445 }
2446
2447 static int ip6_route_dev_notify(struct notifier_block *this,
2448                                 unsigned long event, void *data)
2449 {
2450         struct net_device *dev = (struct net_device *)data;
2451         struct net *net = dev_net(dev);
2452
2453         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2454                 net->ipv6.ip6_null_entry->dst.dev = dev;
2455                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2456 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2457                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2458                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2459                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2460                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2461 #endif
2462         }
2463
2464         return NOTIFY_OK;
2465 }
2466
2467 /*
2468  *      /proc
2469  */
2470
2471 #ifdef CONFIG_PROC_FS
2472
2473 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2474
/*
 * Buffer bookkeeping for /proc route output.
 *
 * NOTE(review): not referenced by any code visible in this part of the
 * file (rt6_info_route() writes through a seq_file instead) -- possibly
 * a leftover; confirm before relying on the field meanings.
 */
struct rt6_proc_arg
{
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2483
2484 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2485 {
2486         struct seq_file *m = p_arg;
2487
2488         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2489
2490 #ifdef CONFIG_IPV6_SUBTREES
2491         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2492 #else
2493         seq_puts(m, "00000000000000000000000000000000 00 ");
2494 #endif
2495
2496         if (rt->rt6i_nexthop) {
2497                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2498         } else {
2499                 seq_puts(m, "00000000000000000000000000000000");
2500         }
2501         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2502                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2503                    rt->dst.__use, rt->rt6i_flags,
2504                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2505         return 0;
2506 }
2507
/*
 * seq_file show routine for the route listing: prints every FIB entry
 * of the namespace stored in m->private via rt6_info_route().
 * Always returns 0.
 */
static int ipv6_route_show(struct seq_file *m, void *v)
{
        struct net *net = (struct net *)m->private;
        fib6_clean_all(net, rt6_info_route, 0, m);
        return 0;
}
2514
/* open() handler: binds ipv6_route_show() through single_open_net(). */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, ipv6_route_show);
}
2519
/*
 * File operations for the route listing; opened through
 * single_open_net() and therefore released with single_release_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release_net,
};
2527
/*
 * seq_file show routine for the routing statistics: prints one line of
 * seven hexadecimal counters for the namespace stored in seq->private.
 * Six come from net->ipv6.rt6_stats; the sixth column is the dst entry
 * count from dst_entries_get_slow().  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        struct net *net = (struct net *)seq->private;
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
                   net->ipv6.rt6_stats->fib_rt_alloc,
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
                   net->ipv6.rt6_stats->fib_discarded_routes);

        return 0;
}
2542
/* open() handler: binds rt6_stats_seq_show() through single_open_net(). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
2547
/*
 * File operations for the routing statistics; opened through
 * single_open_net() and therefore released with single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
2555 #endif  /* CONFIG_PROC_FS */
2556
2557 #ifdef CONFIG_SYSCTL
2558
2559 static
2560 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2561                               void __user *buffer, size_t *lenp, loff_t *ppos)
2562 {
2563         struct net *net = current->nsproxy->net_ns;
2564         int delay = net->ipv6.sysctl.flush_delay;
2565         if (write) {
2566                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2567                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2568                 return 0;
2569         } else
2570                 return -EINVAL;
2571 }
2572
2573 ctl_table ipv6_route_table_template[] = {
2574         {
2575                 .procname       =       "flush",
2576                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2577                 .maxlen         =       sizeof(int),
2578                 .mode           =       0200,
2579                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2580         },
2581         {
2582                 .procname       =       "gc_thresh",
2583                 .data           =       &ip6_dst_ops_template.gc_thresh,
2584                 .maxlen         =       sizeof(int),
2585                 .mode           =       0644,
2586                 .proc_handler   =       proc_dointvec,
2587         },
2588         {
2589                 .procname       =       "max_size",
2590                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2591                 .maxlen         =       sizeof(int),
2592                 .mode           =       0644,
2593                 .proc_handler   =       proc_dointvec,
2594         },
2595         {
2596                 .procname       =       "gc_min_interval",
2597                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2598                 .maxlen         =       sizeof(int),
2599                 .mode           =       0644,
2600                 .proc_handler   =       proc_dointvec_jiffies,
2601         },
2602         {
2603                 .procname       =       "gc_timeout",
2604                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2605                 .maxlen         =       sizeof(int),
2606                 .mode           =       0644,
2607                 .proc_handler   =       proc_dointvec_jiffies,
2608         },
2609         {
2610                 .procname       =       "gc_interval",
2611                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2612                 .maxlen         =       sizeof(int),
2613                 .mode           =       0644,
2614                 .proc_handler   =       proc_dointvec_jiffies,
2615         },
2616         {
2617                 .procname       =       "gc_elasticity",
2618                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2619                 .maxlen         =       sizeof(int),
2620                 .mode           =       0644,
2621                 .proc_handler   =       proc_dointvec,
2622         },
2623         {
2624                 .procname       =       "mtu_expires",
2625                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2626                 .maxlen         =       sizeof(int),
2627                 .mode           =       0644,
2628                 .proc_handler   =       proc_dointvec_jiffies,
2629         },
2630         {
2631                 .procname       =       "min_adv_mss",
2632                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2633                 .maxlen         =       sizeof(int),
2634                 .mode           =       0644,
2635                 .proc_handler   =       proc_dointvec,
2636         },
2637         {
2638                 .procname       =       "gc_min_interval_ms",
2639                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2640                 .maxlen         =       sizeof(int),
2641                 .mode           =       0644,
2642                 .proc_handler   =       proc_dointvec_ms_jiffies,
2643         },
2644         { }
2645 };
2646
2647 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2648 {
2649         struct ctl_table *table;
2650
2651         table = kmemdup(ipv6_route_table_template,
2652                         sizeof(ipv6_route_table_template),
2653                         GFP_KERNEL);
2654
2655         if (table) {
2656                 table[0].data = &net->ipv6.sysctl.flush_delay;
2657                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2658                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2659                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2660                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2661                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2662                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2663                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2664                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2665                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2666         }
2667
2668         return table;
2669 }
2670 #endif
2671
2672 static int __net_init ip6_route_net_init(struct net *net)
2673 {
2674         int ret = -ENOMEM;
2675
2676         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2677                sizeof(net->ipv6.ip6_dst_ops));
2678
2679         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2680                 goto out_ip6_dst_ops;
2681
2682         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2683                                            sizeof(*net->ipv6.ip6_null_entry),
2684                                            GFP_KERNEL);
2685         if (!net->ipv6.ip6_null_entry)
2686                 goto out_ip6_dst_entries;
2687         net->ipv6.ip6_null_entry->dst.path =
2688                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2689         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2690
2691 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2692         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2693                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2694                                                GFP_KERNEL);
2695         if (!net->ipv6.ip6_prohibit_entry)
2696                 goto out_ip6_null_entry;
2697         net->ipv6.ip6_prohibit_entry->dst.path =
2698                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2699         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2700
2701         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2702                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2703                                                GFP_KERNEL);
2704         if (!net->ipv6.ip6_blk_hole_entry)
2705                 goto out_ip6_prohibit_entry;
2706         net->ipv6.ip6_blk_hole_entry->dst.path =
2707                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2708         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2709 #endif
2710
2711         net->ipv6.sysctl.flush_delay = 0;
2712         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2713         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2714         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2715         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2716         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2717         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2718         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2719
2720 #ifdef CONFIG_PROC_FS
2721         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2722         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2723 #endif
2724         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2725
2726         ret = 0;
2727 out:
2728         return ret;
2729
2730 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2731 out_ip6_prohibit_entry:
2732         kfree(net->ipv6.ip6_prohibit_entry);
2733 out_ip6_null_entry:
2734         kfree(net->ipv6.ip6_null_entry);
2735 #endif
2736 out_ip6_dst_entries:
2737         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2738 out_ip6_dst_ops:
2739         goto out;
2740 }
2741
2742 static void __net_exit ip6_route_net_exit(struct net *net)
2743 {
2744 #ifdef CONFIG_PROC_FS
2745         proc_net_remove(net, "ipv6_route");
2746         proc_net_remove(net, "rt6_stats");
2747 #endif
2748         kfree(net->ipv6.ip6_null_entry);
2749 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2750         kfree(net->ipv6.ip6_prohibit_entry);
2751         kfree(net->ipv6.ip6_blk_hole_entry);
2752 #endif
2753         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2754 }
2755
2756 static struct pernet_operations ip6_route_net_ops = {
2757         .init = ip6_route_net_init,
2758         .exit = ip6_route_net_exit,
2759 };
2760
2761 static struct notifier_block ip6_route_dev_notifier = {
2762         .notifier_call = ip6_route_dev_notify,
2763         .priority = 0,
2764 };
2765
2766 int __init ip6_route_init(void)
2767 {
2768         int ret;
2769
2770         ret = -ENOMEM;
2771         ip6_dst_ops_template.kmem_cachep =
2772                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2773                                   SLAB_HWCACHE_ALIGN, NULL);
2774         if (!ip6_dst_ops_template.kmem_cachep)
2775                 goto out;
2776
2777         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2778         if (ret)
2779                 goto out_kmem_cache;
2780
2781         ret = register_pernet_subsys(&ip6_route_net_ops);
2782         if (ret)
2783                 goto out_dst_entries;
2784
2785         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2786
2787         /* Registering of the loopback is done before this portion of code,
2788          * the loopback reference in rt6_info will not be taken, do it
2789          * manually for init_net */
2790         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2791         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2792   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2793         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2794         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2795         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2796         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2797   #endif
2798         ret = fib6_init();
2799         if (ret)
2800                 goto out_register_subsys;
2801
2802         ret = xfrm6_init();
2803         if (ret)
2804                 goto out_fib6_init;
2805
2806         ret = fib6_rules_init();
2807         if (ret)
2808                 goto xfrm6_init;
2809
2810         ret = -ENOBUFS;
2811         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2812             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2813             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2814                 goto fib6_rules_init;
2815
2816         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2817         if (ret)
2818                 goto fib6_rules_init;
2819
2820 out:
2821         return ret;
2822
2823 fib6_rules_init:
2824         fib6_rules_cleanup();
2825 xfrm6_init:
2826         xfrm6_fini();
2827 out_fib6_init:
2828         fib6_gc_cleanup();
2829 out_register_subsys:
2830         unregister_pernet_subsys(&ip6_route_net_ops);
2831 out_dst_entries:
2832         dst_entries_destroy(&ip6_dst_blackhole_ops);
2833 out_kmem_cache:
2834         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2835         goto out;
2836 }
2837
2838 void ip6_route_cleanup(void)
2839 {
2840         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2841         fib6_rules_cleanup();
2842         xfrm6_fini();
2843         fib6_gc_cleanup();
2844         unregister_pernet_subsys(&ip6_route_net_ops);
2845         dst_entries_destroy(&ip6_dst_blackhole_ops);
2846         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2847 }