/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD; our system is still very
 *                                      different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

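/*
 * Bucket selection for the route cache (descriptive note): jhash_3words()
 * mixes the destination address, source address and TOS with rt_hash_rnd,
 * which is re-rolled on every cache flush so the bucket distribution stays
 * unpredictable, and the result is masked down to the table size
 * (rt_hash_mask == 2^rt_hash_log - 1).
 */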
static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

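/*
 * Common expiry test for the periodic timer and the garbage collector.
 * A held entry (__refcnt != 0) is never removed; an entry whose hard
 * expiration time has passed always is.  Otherwise ordinary entries
 * survive while younger than tmo1, entries rt_valuable() wants to keep
 * (redirects, notifications, PMTU state) while younger than tmo2, and
 * broadcast/multicast chain colliders (rt_fast_clean) get no tmo1 grace.
 */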
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}

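/*
 * Cache lookups match on the full IPv4 flow key: the memcmp() covers the
 * whole nl_u.ip4_u block in one shot, and the interface indices (oif/iif)
 * are compared separately.
 */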
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                *rthp = rth->u.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.rt_next;

                        rthp = &rth->u.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
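/*
 * One timer run walks a slice of the hash table sized so the whole table
 * is covered roughly every ip_rt_gc_timeout jiffies.  Within a bucket,
 * tmo starts at ip_rt_gc_timeout and is halved for every entry that is
 * kept, so the tolerance shrinks as chains grow long.
 */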
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}
static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state */
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer
                   to "delay", otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking is idle
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
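
/*
 * Concretely (a reading of the code below, not a spec): the trigger line
 * is ip_rt_gc_elasticity entries per hash bucket.  Below it the cache is
 * only trimmed back toward gc_thresh; above it the goal jumps to half the
 * excess, but no less than one full table's worth of entries.  "expire"
 * is halved after every pass that misses its goal, and creeps back up
 * (+= ip_rt_gc_min_interval) once a pass succeeds.
 */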

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not make it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate number of entries, which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in dangerous area. Try to reduce cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                                /* remove all related balanced entries
                                 * if necessary
                                 */
                                if (rth->u.dst.flags & DST_BALANCED) {
                                        int r;

                                        /* Pass the chain of the bucket we
                                         * are actually scanning and hold
                                         * the lock for (k, not i). */
                                        rthp = rt_remove_balanced_route(
                                                &rt_hash_table[k].chain,
                                                rth,
                                                &r);
                                        goal -= r;
                                        if (!rthp)
                                                break;
                                } else {
                                        *rthp = rth->u.rt_next;
                                        rt_free(rth);
                                        goal--;
                                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

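/*
 * Insert rt into bucket "hash", or return an existing entry for the same
 * flow.  On a hit the old entry is moved to the front of its chain (the
 * two rcu_assign_pointer() calls keep lockless readers safe) and rt is
 * dropped.  While scanning, the lowest-scoring unreferenced entry is
 * remembered as an eviction candidate in case the chain has grown past
 * ip_rt_gc_elasticity.
 */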
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average length of the
                 * chain; when it is exceeded gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

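/*
 * Attach the long-lived inet_peer entry for rt_dst to this route.  The
 * lookup is done before taking rt_peer_lock; if another CPU won the race
 * and rt->peer is already set, our reference is returned with
 * inet_putpeer().
 */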
void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

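/* Unlink one specific entry from its bucket and drop the caller's
 * reference; harmless if the entry has already been removed. */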
static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

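/*
 * ICMP redirect handler.  After validating the new gateway (it must
 * differ from the old one, be a unicast address, and pass the
 * shared-media/onlink checks below), every matching cache entry is
 * replaced by a copy pointing at new_gw, flagged RTCF_REDIRECTED, and
 * kept only if a valid ARP neighbour can be bound to it.  The skeys[]
 * and ikeys[] pairs also cover entries hashed with a zero source key
 * or output interface.
 */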
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

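/*
 * dst negative advice callback, reached via dst_negative_advice() when an
 * upper layer (TCP's retransmit timer, for example) suspects the route is
 * dead.  Obsolete entries merely lose the caller's reference; redirected
 * or expiring entries are deleted from the cache so the next lookup has
 * to take the slow path.
 */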
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

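/*
 * Worked example of the backoff above, assuming the default tunables and
 * HZ = 1000 (illustrative only): the gap enforced before redirect k is
 * ip_rt_redirect_load << rate_tokens, i.e. 20ms, 40ms, ... up to
 * 20ms << 8 = 5.12s before the 9th.  After ip_rt_redirect_number
 * redirects we go silent until ip_rt_redirect_silence
 * ((HZ/50) << 10, about 20.5s) passes with no redirect-worthy traffic.
 */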
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

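/*
 * Terminal input handler for error routes: translate the dst error into
 * an ICMP destination-unreachable code and rate-limit it with a simple
 * token bucket (tokens accrue one per jiffy up to ip_rt_error_burst;
 * each ICMP sent costs ip_rt_error_cost).
 */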
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}

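/*
 * PMTU discovery back-end, driven by ICMP "fragmentation needed".
 * Broken routers may report mtu == 0, in which case guess_mtu() drops to
 * the next plateau below the failing packet's size, e.g. a 1500-byte
 * datagram yields a guess of 1492.  The new value is clamped to
 * ip_rt_min_pmtu (552 with the defaults above), the MTU metric is locked
 * if clamping was needed, and the entry is aged out after
 * ip_rt_mtu_expires.
 */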
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

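/*
 * dst_ops hook for when a smaller MTU is learned for a single dst by
 * some other layer; applies the same min-PMTU clamping and expiry as
 * the ICMP path above.
 */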
1405 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1406 {
1407         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1408             !(dst_metric_locked(dst, RTAX_MTU))) {
1409                 if (mtu < ip_rt_min_pmtu) {
1410                         mtu = ip_rt_min_pmtu;
1411                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1412                 }
1413                 dst->metrics[RTAX_MTU-1] = mtu;
1414                 dst_set_expires(dst, ip_rt_mtu_expires);
1415         }
1416 }
1417
1418 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1419 {
1420         return NULL;
1421 }
1422
1423 static void ipv4_dst_destroy(struct dst_entry *dst)
1424 {
1425         struct rtable *rt = (struct rtable *) dst;
1426         struct inet_peer *peer = rt->peer;
1427         struct in_device *idev = rt->idev;
1428
1429         if (peer) {
1430                 rt->peer = NULL;
1431                 inet_putpeer(peer);
1432         }
1433
1434         if (idev) {
1435                 rt->idev = NULL;
1436                 in_dev_put(idev);
1437         }
1438 }
1439
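/*
 * The device behind a cache entry is going away: re-point the entry's
 * in_device reference at the loopback device so the entry can still be
 * torn down safely after the real device has been unregistered.
 */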
1440 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1441                             int how)
1442 {
1443         struct rtable *rt = (struct rtable *) dst;
1444         struct in_device *idev = rt->idev;
1445         if (dev != &loopback_dev && idev && idev->dev == dev) {
1446                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1447                 if (loopback_idev) {
1448                         rt->idev = loopback_idev;
1449                         in_dev_put(idev);
1450                 }
1451         }
1452 }
1453
1454 static void ipv4_link_failure(struct sk_buff *skb)
1455 {
1456         struct rtable *rt;
1457
1458         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1459
1460         rt = (struct rtable *) skb->dst;
1461         if (rt)
1462                 dst_set_expires(&rt->u.dst, 0);
1463 }
1464
1465 static int ip_rt_bug(struct sk_buff *skb)
1466 {
1467         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1468                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1469                 skb->dev ? skb->dev->name : "?");
1470         kfree_skb(skb);
1471         return 0;
1472 }
1473
1474 /*
1475    We do not cache the source address of the outgoing interface,
1476    because it is used only by the IP RR, TS and SRR options,
1477    so it is out of the fast path.
1478
1479    BTW remember: "addr" is allowed to be unaligned
1480    in IP options!
1481  */
1482
1483 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1484 {
1485         u32 src;
1486         struct fib_result res;
1487
1488         if (rt->fl.iif == 0)
1489                 src = rt->rt_src;
1490         else if (fib_lookup(&rt->fl, &res) == 0) {
1491                 src = FIB_RES_PREFSRC(res);
1492                 fib_res_put(&res);
1493         } else
1494                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1495                                         RT_SCOPE_UNIVERSE);
1496         memcpy(addr, &src, 4);
1497 }
1498
1499 #ifdef CONFIG_NET_CLS_ROUTE
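/*
 * Fill in whichever halves of the 32-bit class tag are still unset:
 * the low 16 bits carry the destination realm, the high 16 the source
 * realm.
 */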
1500 static void set_class_tag(struct rtable *rt, u32 tag)
1501 {
1502         if (!(rt->u.dst.tclassid & 0xFFFF))
1503                 rt->u.dst.tclassid |= tag & 0xFFFF;
1504         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1505                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1506 }
1507 #endif
1508
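/*
 * Copy nexthop data from the FIB lookup result into a fresh cache
 * entry: gateway, metrics and classification tag, then fill in sane
 * defaults for any metric the FIB left at zero.
 */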
1509 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1510 {
1511         struct fib_info *fi = res->fi;
1512
1513         if (fi) {
1514                 if (FIB_RES_GW(*res) &&
1515                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1516                         rt->rt_gateway = FIB_RES_GW(*res);
1517                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1518                        sizeof(rt->u.dst.metrics));
1519                 if (fi->fib_mtu == 0) {
1520                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1521                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1522                             rt->rt_gateway != rt->rt_dst &&
1523                             rt->u.dst.dev->mtu > 576)
1524                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1525                 }
1526 #ifdef CONFIG_NET_CLS_ROUTE
1527                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1528 #endif
1529         } else
1530                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1531
1532         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1533                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1534         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1535                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
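        /* Default advertised MSS: device MTU minus 40 bytes of minimal
         * IPv4 + TCP headers, clamped between ip_rt_min_advmss and
         * 65535 - 40. */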
1536         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1537                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1538                                        ip_rt_min_advmss);
1539         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1540                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1541
1542 #ifdef CONFIG_NET_CLS_ROUTE
1543 #ifdef CONFIG_IP_MULTIPLE_TABLES
1544         set_class_tag(rt, fib_rules_tclass(res));
1545 #endif
1546         set_class_tag(rt, itag);
1547 #endif
1548         rt->rt_type = res->type;
1549 }
1550
1551 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1552                                 u8 tos, struct net_device *dev, int our)
1553 {
1554         unsigned hash;
1555         struct rtable *rth;
1556         u32 spec_dst;
1557         struct in_device *in_dev = in_dev_get(dev);
1558         u32 itag = 0;
1559
1560         /* Primary sanity checks. */
1561
1562         if (in_dev == NULL)
1563                 return -EINVAL;
1564
1565         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1566             skb->protocol != htons(ETH_P_IP))
1567                 goto e_inval;
1568
1569         if (ZERONET(saddr)) {
1570                 if (!LOCAL_MCAST(daddr))
1571                         goto e_inval;
1572                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1573         } else if (fib_validate_source(saddr, 0, tos, 0,
1574                                         dev, &spec_dst, &itag) < 0)
1575                 goto e_inval;
1576
1577         rth = dst_alloc(&ipv4_dst_ops);
1578         if (!rth)
1579                 goto e_nobufs;
1580
1581         rth->u.dst.output= ip_rt_bug;
1582
1583         atomic_set(&rth->u.dst.__refcnt, 1);
1584         rth->u.dst.flags= DST_HOST;
1585         if (in_dev->cnf.no_policy)
1586                 rth->u.dst.flags |= DST_NOPOLICY;
1587         rth->fl.fl4_dst = daddr;
1588         rth->rt_dst     = daddr;
1589         rth->fl.fl4_tos = tos;
1590 #ifdef CONFIG_IP_ROUTE_FWMARK
1591         rth->fl.fl4_fwmark= skb->nfmark;
1592 #endif
1593         rth->fl.fl4_src = saddr;
1594         rth->rt_src     = saddr;
1595 #ifdef CONFIG_NET_CLS_ROUTE
1596         rth->u.dst.tclassid = itag;
1597 #endif
1598         rth->rt_iif     =
1599         rth->fl.iif     = dev->ifindex;
1600         rth->u.dst.dev  = &loopback_dev;
1601         dev_hold(rth->u.dst.dev);
1602         rth->idev       = in_dev_get(rth->u.dst.dev);
1603         rth->fl.oif     = 0;
1604         rth->rt_gateway = daddr;
1605         rth->rt_spec_dst= spec_dst;
1606         rth->rt_type    = RTN_MULTICAST;
1607         rth->rt_flags   = RTCF_MULTICAST;
1608         if (our) {
1609                 rth->u.dst.input= ip_local_deliver;
1610                 rth->rt_flags |= RTCF_LOCAL;
1611         }
1612
1613 #ifdef CONFIG_IP_MROUTE
1614         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1615                 rth->u.dst.input = ip_mr_input;
1616 #endif
1617         RT_CACHE_STAT_INC(in_slow_mc);
1618
1619         in_dev_put(in_dev);
1620         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1621         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1622
1623 e_nobufs:
1624         in_dev_put(in_dev);
1625         return -ENOBUFS;
1626
1627 e_inval:
1628         in_dev_put(in_dev);
1629         return -EINVAL;
1630 }
1631
1632
1633 static void ip_handle_martian_source(struct net_device *dev,
1634                                      struct in_device *in_dev,
1635                                      struct sk_buff *skb,
1636                                      u32 daddr,
1637                                      u32 saddr) 
1638 {
1639         RT_CACHE_STAT_INC(in_martian_src);
1640 #ifdef CONFIG_IP_ROUTE_VERBOSE
1641         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1642                 /*
1643                  *      RFC1812 recommendation: if the source is martian,
1644                  *      the only hint is the MAC header.
1645                  */
1646                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1647                         "%u.%u.%u.%u, on dev %s\n",
1648                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1649                 if (dev->hard_header_len) {
1650                         int i;
1651                         unsigned char *p = skb->mac.raw;
1652                         printk(KERN_WARNING "ll header: ");
1653                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1654                                 printk("%02x", *p);
1655                                 if (i < (dev->hard_header_len - 1))
1656                                         printk(":");
1657                         }
1658                         printk("\n");
1659                 }
1660         }
1661 #endif
1662 }
1663
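/*
 * Build one forwarding cache entry for the nexthop currently selected
 * in *res: validate the source address, decide whether a redirect
 * should be sent, then allocate and populate the rtable.
 */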
1664 static inline int __mkroute_input(struct sk_buff *skb, 
1665                                   struct fib_result* res, 
1666                                   struct in_device *in_dev, 
1667                                   u32 daddr, u32 saddr, u32 tos, 
1668                                   struct rtable **result) 
1669 {
1670
1671         struct rtable *rth;
1672         int err;
1673         struct in_device *out_dev;
1674         unsigned flags = 0;
1675         u32 spec_dst, itag;
1676
1677         /* get a working reference to the output device */
1678         out_dev = in_dev_get(FIB_RES_DEV(*res));
1679         if (out_dev == NULL) {
1680                 if (net_ratelimit())
1681                         printk(KERN_CRIT "Bug in ip_route_input" \
1682                                "_slow(). Please, report\n");
1683                 return -EINVAL;
1684         }
1685
1686
1687         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1688                                   in_dev->dev, &spec_dst, &itag);
1689         if (err < 0) {
1690                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1691                                          saddr);
1692                 
1693                 err = -EINVAL;
1694                 goto cleanup;
1695         }
1696
1697         if (err)
1698                 flags |= RTCF_DIRECTSRC;
1699
1700         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1701             (IN_DEV_SHARED_MEDIA(out_dev) ||
1702              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1703                 flags |= RTCF_DOREDIRECT;
1704
1705         if (skb->protocol != htons(ETH_P_IP)) {
1706                 /* Not IP (i.e. ARP). Do not create a route if it is
1707                  * invalid for proxy ARP. DNAT routes are always valid.
1708                  */
1709                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1710                         err = -EINVAL;
1711                         goto cleanup;
1712                 }
1713         }
1714
1715
1716         rth = dst_alloc(&ipv4_dst_ops);
1717         if (!rth) {
1718                 err = -ENOBUFS;
1719                 goto cleanup;
1720         }
1721
1722         rth->u.dst.flags= DST_HOST;
1723 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1724         if (res->fi->fib_nhs > 1)
1725                 rth->u.dst.flags |= DST_BALANCED;
1726 #endif
1727         if (in_dev->cnf.no_policy)
1728                 rth->u.dst.flags |= DST_NOPOLICY;
1729         if (in_dev->cnf.no_xfrm)
1730                 rth->u.dst.flags |= DST_NOXFRM;
1731         rth->fl.fl4_dst = daddr;
1732         rth->rt_dst     = daddr;
1733         rth->fl.fl4_tos = tos;
1734 #ifdef CONFIG_IP_ROUTE_FWMARK
1735         rth->fl.fl4_fwmark= skb->nfmark;
1736 #endif
1737         rth->fl.fl4_src = saddr;
1738         rth->rt_src     = saddr;
1739         rth->rt_gateway = daddr;
1740         rth->rt_iif     =
1741                 rth->fl.iif     = in_dev->dev->ifindex;
1742         rth->u.dst.dev  = (out_dev)->dev;
1743         dev_hold(rth->u.dst.dev);
1744         rth->idev       = in_dev_get(rth->u.dst.dev);
1745         rth->fl.oif     = 0;
1746         rth->rt_spec_dst= spec_dst;
1747
1748         rth->u.dst.input = ip_forward;
1749         rth->u.dst.output = ip_output;
1750
1751         rt_set_nexthop(rth, res, itag);
1752
1753         rth->rt_flags = flags;
1754
1755         *result = rth;
1756         err = 0;
1757  cleanup:
1758         /* release the working reference to the output device */
1759         in_dev_put(out_dev);
1760         return err;
1761 }                                               
1762
1763 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1764                                        struct fib_result* res, 
1765                                        const struct flowi *fl,
1766                                        struct in_device *in_dev,
1767                                        u32 daddr, u32 saddr, u32 tos)
1768 {
1769         struct rtable* rth;
1770         int err;
1771         unsigned hash;
1772
1773 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1774         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1775                 fib_select_multipath(fl, res);
1776 #endif
1777
1778         /* create a routing cache entry */
1779         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1780         if (err)
1781                 return err;
1782         atomic_set(&rth->u.dst.__refcnt, 1);
1783
1784         /* put it into the cache */
1785         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1786         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1787 }
1788
1789 static inline int ip_mkroute_input(struct sk_buff *skb, 
1790                                    struct fib_result* res, 
1791                                    const struct flowi *fl,
1792                                    struct in_device *in_dev,
1793                                    u32 daddr, u32 saddr, u32 tos)
1794 {
1795 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1796         struct rtable* rth;
1797         unsigned char hop, hopcount, lasthop;
1798         int err = -EINVAL;
1799         unsigned int hash;
1800
1801         if (res->fi)
1802                 hopcount = res->fi->fib_nhs;
1803         else
1804                 hopcount = 1;
1805
1806         lasthop = hopcount - 1;
1807
1808         /* distinguish between multipath and singlepath */
1809         if (hopcount < 2)
1810                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1811                                             saddr, tos);
1812         
1813         /* add all alternatives to the routing cache */
1814         for (hop = 0; hop < hopcount; hop++) {
1815                 res->nh_sel = hop;
1816
1817                 /* create a routing cache entry */
1818                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1819                                       &rth);
1820                 if (err)
1821                         return err;
1822
1823                 /* put it into the cache */
1824                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825                 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826                 if (err)
1827                         return err;
1828
1829                 /* forward hop information to multipath impl. */
1830                 multipath_set_nhinfo(rth,
1831                                      FIB_RES_NETWORK(*res),
1832                                      FIB_RES_NETMASK(*res),
1833                                      res->prefixlen,
1834                                      &FIB_RES_NH(*res));
1835
1836                 /* the reference count is handled outside only for
1837                  * the last hop
1838                  */
1839                 if (hop == lasthop)
1840                         atomic_set(&(skb->dst->__refcnt), 1);
1841         }
1842         return err;
1843 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1844         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1845 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1846 }
1847
1848
1849 /*
1850  *      NOTE. We drop all packets that have a local source
1851  *      address, because every properly looped-back packet
1852  *      must already have the correct destination attached by the output routine.
1853  *
1854  *      This approach solves two big problems:
1855  *      1. Non-simplex devices are handled properly.
1856  *      2. IP spoofing attempts are filtered out with a 100% guarantee.
1857  */
1858
1859 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1860                                u8 tos, struct net_device *dev)
1861 {
1862         struct fib_result res;
1863         struct in_device *in_dev = in_dev_get(dev);
1864         struct flowi fl = { .nl_u = { .ip4_u =
1865                                       { .daddr = daddr,
1866                                         .saddr = saddr,
1867                                         .tos = tos,
1868                                         .scope = RT_SCOPE_UNIVERSE,
1869 #ifdef CONFIG_IP_ROUTE_FWMARK
1870                                         .fwmark = skb->nfmark
1871 #endif
1872                                       } },
1873                             .iif = dev->ifindex };
1874         unsigned        flags = 0;
1875         u32             itag = 0;
1876         struct rtable * rth;
1877         unsigned        hash;
1878         u32             spec_dst;
1879         int             err = -EINVAL;
1880         int             free_res = 0;
1881
1882         /* IP on this device is disabled. */
1883
1884         if (!in_dev)
1885                 goto out;
1886
1887         /* Check for the weirdest martians, which cannot be detected
1888            by fib_lookup.
1889          */
1890
1891         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1892                 goto martian_source;
1893
1894         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1895                 goto brd_input;
1896
1897         /* Accept zero addresses only for limited broadcast;
1898          * I do not even know whether to fix it or not. Waiting for complaints :-)
1899          */
1900         if (ZERONET(saddr))
1901                 goto martian_source;
1902
1903         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1904                 goto martian_destination;
1905
1906         /*
1907          *      Now we are ready to route packet.
1908          */
1909         if ((err = fib_lookup(&fl, &res)) != 0) {
1910                 if (!IN_DEV_FORWARD(in_dev))
1911                         goto e_inval;
1912                 goto no_route;
1913         }
1914         free_res = 1;
1915
1916         RT_CACHE_STAT_INC(in_slow_tot);
1917
1918         if (res.type == RTN_BROADCAST)
1919                 goto brd_input;
1920
1921         if (res.type == RTN_LOCAL) {
1922                 int result;
1923                 result = fib_validate_source(saddr, daddr, tos,
1924                                              loopback_dev.ifindex,
1925                                              dev, &spec_dst, &itag);
1926                 if (result < 0)
1927                         goto martian_source;
1928                 if (result)
1929                         flags |= RTCF_DIRECTSRC;
1930                 spec_dst = daddr;
1931                 goto local_input;
1932         }
1933
1934         if (!IN_DEV_FORWARD(in_dev))
1935                 goto e_inval;
1936         if (res.type != RTN_UNICAST)
1937                 goto martian_destination;
1938
1939         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1940         if (err == -ENOBUFS)
1941                 goto e_nobufs;
1942         if (err == -EINVAL)
1943                 goto e_inval;
1944         
1945 done:
1946         in_dev_put(in_dev);
1947         if (free_res)
1948                 fib_res_put(&res);
1949 out:    return err;
1950
1951 brd_input:
1952         if (skb->protocol != htons(ETH_P_IP))
1953                 goto e_inval;
1954
1955         if (ZERONET(saddr))
1956                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957         else {
1958                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959                                           &itag);
1960                 if (err < 0)
1961                         goto martian_source;
1962                 if (err)
1963                         flags |= RTCF_DIRECTSRC;
1964         }
1965         flags |= RTCF_BROADCAST;
1966         res.type = RTN_BROADCAST;
1967         RT_CACHE_STAT_INC(in_brd);
1968
1969 local_input:
1970         rth = dst_alloc(&ipv4_dst_ops);
1971         if (!rth)
1972                 goto e_nobufs;
1973
1974         rth->u.dst.output= ip_rt_bug;
1975
1976         atomic_set(&rth->u.dst.__refcnt, 1);
1977         rth->u.dst.flags= DST_HOST;
1978         if (in_dev->cnf.no_policy)
1979                 rth->u.dst.flags |= DST_NOPOLICY;
1980         rth->fl.fl4_dst = daddr;
1981         rth->rt_dst     = daddr;
1982         rth->fl.fl4_tos = tos;
1983 #ifdef CONFIG_IP_ROUTE_FWMARK
1984         rth->fl.fl4_fwmark= skb->nfmark;
1985 #endif
1986         rth->fl.fl4_src = saddr;
1987         rth->rt_src     = saddr;
1988 #ifdef CONFIG_NET_CLS_ROUTE
1989         rth->u.dst.tclassid = itag;
1990 #endif
1991         rth->rt_iif     =
1992         rth->fl.iif     = dev->ifindex;
1993         rth->u.dst.dev  = &loopback_dev;
1994         dev_hold(rth->u.dst.dev);
1995         rth->idev       = in_dev_get(rth->u.dst.dev);
1996         rth->rt_gateway = daddr;
1997         rth->rt_spec_dst= spec_dst;
1998         rth->u.dst.input= ip_local_deliver;
1999         rth->rt_flags   = flags|RTCF_LOCAL;
2000         if (res.type == RTN_UNREACHABLE) {
2001                 rth->u.dst.input= ip_error;
2002                 rth->u.dst.error= -err;
2003                 rth->rt_flags   &= ~RTCF_LOCAL;
2004         }
2005         rth->rt_type    = res.type;
2006         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2007         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2008         goto done;
2009
2010 no_route:
2011         RT_CACHE_STAT_INC(in_no_route);
2012         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2013         res.type = RTN_UNREACHABLE;
2014         goto local_input;
2015
2016         /*
2017          *      Do not cache martian addresses: they should be logged (RFC1812)
2018          */
2019 martian_destination:
2020         RT_CACHE_STAT_INC(in_martian_dst);
2021 #ifdef CONFIG_IP_ROUTE_VERBOSE
2022         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2023                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2024                         "%u.%u.%u.%u, dev %s\n",
2025                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2026 #endif
2027 e_inval:
2028         err = -EINVAL;
2029         goto done;
2030
2031 e_nobufs:
2032         err = -ENOBUFS;
2033         goto done;
2034
2035 martian_source:
2036         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2037         goto e_inval;
2038 }
2039
2040 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2041                    u8 tos, struct net_device *dev)
2042 {
2043         struct rtable * rth;
2044         unsigned        hash;
2045         int iif = dev->ifindex;
2046
2047         tos &= IPTOS_RT_MASK;
2048         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2049
2050         rcu_read_lock();
2051         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052              rth = rcu_dereference(rth->u.rt_next)) {
2053                 if (rth->fl.fl4_dst == daddr &&
2054                     rth->fl.fl4_src == saddr &&
2055                     rth->fl.iif == iif &&
2056                     rth->fl.oif == 0 &&
2057 #ifdef CONFIG_IP_ROUTE_FWMARK
2058                     rth->fl.fl4_fwmark == skb->nfmark &&
2059 #endif
2060                     rth->fl.fl4_tos == tos) {
2061                         rth->u.dst.lastuse = jiffies;
2062                         dst_hold(&rth->u.dst);
2063                         rth->u.dst.__use++;
2064                         RT_CACHE_STAT_INC(in_hit);
2065                         rcu_read_unlock();
2066                         skb->dst = (struct dst_entry*)rth;
2067                         return 0;
2068                 }
2069                 RT_CACHE_STAT_INC(in_hlist_search);
2070         }
2071         rcu_read_unlock();
2072
2073         /* Multicast recognition logic has moved from the route cache to here.
2074            The problem was that too many Ethernet cards have broken/missing
2075            hardware multicast filters :-( As a result, a host on a multicast
2076            network acquires a lot of useless route cache entries, e.g. for
2077            SDR messages from all over the world. Now we try to get rid of them.
2078            Really, provided the software IP multicast filter is organized
2079            reasonably (at least, hashed), this does not result in a slowdown
2080            compared with route cache reject entries.
2081            Note that multicast routers are not affected, because a
2082            route cache entry is created eventually.
2083          */
2084         if (MULTICAST(daddr)) {
2085                 struct in_device *in_dev;
2086
2087                 rcu_read_lock();
2088                 if ((in_dev = __in_dev_get(dev)) != NULL) {
2089                         int our = ip_check_mc(in_dev, daddr, saddr,
2090                                 skb->nh.iph->protocol);
2091                         if (our
2092 #ifdef CONFIG_IP_MROUTE
2093                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2094 #endif
2095                             ) {
2096                                 rcu_read_unlock();
2097                                 return ip_route_input_mc(skb, daddr, saddr,
2098                                                          tos, dev, our);
2099                         }
2100                 }
2101                 rcu_read_unlock();
2102                 return -EINVAL;
2103         }
2104         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2105 }
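/*
 * Illustrative sketch (modelled on the usual caller in ip_input.c):
 * the receive path routes each packet that arrives without a cached
 * route, e.g.
 *
 *	struct iphdr *iph = skb->nh.iph;
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *
 * On success skb->dst holds a reference to the cache entry, whose
 * ->input hook later delivers or forwards the packet.
 */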
2106
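/*
 * Output-path counterpart of __mkroute_input(): classify the
 * destination (broadcast/multicast/unicast), then allocate and fill a
 * cache entry bound to dev_out.
 */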
2107 static inline int __mkroute_output(struct rtable **result,
2108                                    struct fib_result* res, 
2109                                    const struct flowi *fl,
2110                                    const struct flowi *oldflp, 
2111                                    struct net_device *dev_out, 
2112                                    unsigned flags) 
2113 {
2114         struct rtable *rth;
2115         struct in_device *in_dev;
2116         u32 tos = RT_FL_TOS(oldflp);
2117         int err = 0;
2118
2119         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2120                 return -EINVAL;
2121
2122         if (fl->fl4_dst == 0xFFFFFFFF)
2123                 res->type = RTN_BROADCAST;
2124         else if (MULTICAST(fl->fl4_dst))
2125                 res->type = RTN_MULTICAST;
2126         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2127                 return -EINVAL;
2128
2129         if (dev_out->flags & IFF_LOOPBACK)
2130                 flags |= RTCF_LOCAL;
2131
2132         /* get a working reference to the inet device */
2133         in_dev = in_dev_get(dev_out);
2134         if (!in_dev)
2135                 return -EINVAL;
2136
2137         if (res->type == RTN_BROADCAST) {
2138                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2139                 if (res->fi) {
2140                         fib_info_put(res->fi);
2141                         res->fi = NULL;
2142                 }
2143         } else if (res->type == RTN_MULTICAST) {
2144                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2145                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2146                                  oldflp->proto))
2147                         flags &= ~RTCF_LOCAL;
2148                 /* If a multicast route does not exist, use the
2149                    default one, but do not use a gateway in this case.
2150                    Yes, it is a hack.
2151                  */
2152                 if (res->fi && res->prefixlen < 4) {
2153                         fib_info_put(res->fi);
2154                         res->fi = NULL;
2155                 }
2156         }
2157
2158
2159         rth = dst_alloc(&ipv4_dst_ops);
2160         if (!rth) {
2161                 err = -ENOBUFS;
2162                 goto cleanup;
2163         }               
2164
2165         rth->u.dst.flags= DST_HOST;
2166 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2167         if (res->fi) {
2168                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2169                 if (res->fi->fib_nhs > 1)
2170                         rth->u.dst.flags |= DST_BALANCED;
2171         }
2172 #endif
2173         if (in_dev->cnf.no_xfrm)
2174                 rth->u.dst.flags |= DST_NOXFRM;
2175         if (in_dev->cnf.no_policy)
2176                 rth->u.dst.flags |= DST_NOPOLICY;
2177
2178         rth->fl.fl4_dst = oldflp->fl4_dst;
2179         rth->fl.fl4_tos = tos;
2180         rth->fl.fl4_src = oldflp->fl4_src;
2181         rth->fl.oif     = oldflp->oif;
2182 #ifdef CONFIG_IP_ROUTE_FWMARK
2183         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2184 #endif
2185         rth->rt_dst     = fl->fl4_dst;
2186         rth->rt_src     = fl->fl4_src;
2187         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2188         /* get references to the devices that are to be held by the routing
2189            cache entry */
2190         rth->u.dst.dev  = dev_out;
2191         dev_hold(dev_out);
2192         rth->idev       = in_dev_get(dev_out);
2193         rth->rt_gateway = fl->fl4_dst;
2194         rth->rt_spec_dst= fl->fl4_src;
2195
2196         rth->u.dst.output=ip_output;
2197
2198         RT_CACHE_STAT_INC(out_slow_tot);
2199
2200         if (flags & RTCF_LOCAL) {
2201                 rth->u.dst.input = ip_local_deliver;
2202                 rth->rt_spec_dst = fl->fl4_dst;
2203         }
2204         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205                 rth->rt_spec_dst = fl->fl4_src;
2206                 if (flags & RTCF_LOCAL && 
2207                     !(dev_out->flags & IFF_LOOPBACK)) {
2208                         rth->u.dst.output = ip_mc_output;
2209                         RT_CACHE_STAT_INC(out_slow_mc);
2210                 }
2211 #ifdef CONFIG_IP_MROUTE
2212                 if (res->type == RTN_MULTICAST) {
2213                         if (IN_DEV_MFORWARD(in_dev) &&
2214                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2215                                 rth->u.dst.input = ip_mr_input;
2216                                 rth->u.dst.output = ip_mc_output;
2217                         }
2218                 }
2219 #endif
2220         }
2221
2222         rt_set_nexthop(rth, res, 0);
2223
2224         rth->rt_flags = flags;
2225
2226         *result = rth;
2227  cleanup:
2228         /* release the working reference to the inet device */
2229         in_dev_put(in_dev);
2230
2231         return err;
2232 }
2233
2234 static inline int ip_mkroute_output_def(struct rtable **rp,
2235                                         struct fib_result* res,
2236                                         const struct flowi *fl,
2237                                         const struct flowi *oldflp,
2238                                         struct net_device *dev_out,
2239                                         unsigned flags)
2240 {
2241         struct rtable *rth;
2242         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243         unsigned hash;
2244         if (err == 0) {
2245                 u32 tos = RT_FL_TOS(oldflp);
2246
2247                 atomic_set(&rth->u.dst.__refcnt, 1);
2248                 
2249                 hash = rt_hash_code(oldflp->fl4_dst, 
2250                                     oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2251                 err = rt_intern_hash(hash, rth, rp);
2252         }
2253         
2254         return err;
2255 }
2256
2257 static inline int ip_mkroute_output(struct rtable** rp,
2258                                     struct fib_result* res,
2259                                     const struct flowi *fl,
2260                                     const struct flowi *oldflp,
2261                                     struct net_device *dev_out,
2262                                     unsigned flags)
2263 {
2264 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2265         u32 tos = RT_FL_TOS(oldflp);
2266         unsigned char hop;
2267         unsigned hash;
2268         int err = -EINVAL;
2269         struct rtable *rth;
2270
2271         if (res->fi && res->fi->fib_nhs > 1) {
2272                 unsigned char hopcount = res->fi->fib_nhs;
2273
2274                 for (hop = 0; hop < hopcount; hop++) {
2275                         struct net_device *dev2nexthop;
2276
2277                         res->nh_sel = hop;
2278
2279                         /* hold a working reference to the output device */
2280                         dev2nexthop = FIB_RES_DEV(*res);
2281                         dev_hold(dev2nexthop);
2282
2283                         err = __mkroute_output(&rth, res, fl, oldflp,
2284                                                dev2nexthop, flags);
2285
2286                         if (err != 0)
2287                                 goto cleanup;
2288
2289                         hash = rt_hash_code(oldflp->fl4_dst, 
2290                                             oldflp->fl4_src ^
2291                                             (oldflp->oif << 5), tos);
2292                         err = rt_intern_hash(hash, rth, rp);
2293
2294                         /* forward hop information to multipath impl. */
2295                         multipath_set_nhinfo(rth,
2296                                              FIB_RES_NETWORK(*res),
2297                                              FIB_RES_NETMASK(*res),
2298                                              res->prefixlen,
2299                                              &FIB_RES_NH(*res));
2300                 cleanup:
2301                         /* release the working reference to the output device */
2302                         dev_put(dev2nexthop);
2303
2304                         if (err != 0)
2305                                 return err;
2306                 }
2307                 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2308                 return err;
2309         } else {
2310                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2311                                              flags);
2312         }
2313 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2314         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2315 #endif
2316 }
2317
2318 /*
2319  * Major route resolver routine.
2320  */
2321
2322 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2323 {
2324         u32 tos = RT_FL_TOS(oldflp);
2325         struct flowi fl = { .nl_u = { .ip4_u =
2326                                       { .daddr = oldflp->fl4_dst,
2327                                         .saddr = oldflp->fl4_src,
2328                                         .tos = tos & IPTOS_RT_MASK,
2329                                         .scope = ((tos & RTO_ONLINK) ?
2330                                                   RT_SCOPE_LINK :
2331                                                   RT_SCOPE_UNIVERSE),
2332 #ifdef CONFIG_IP_ROUTE_FWMARK
2333                                         .fwmark = oldflp->fl4_fwmark
2334 #endif
2335                                       } },
2336                             .iif = loopback_dev.ifindex,
2337                             .oif = oldflp->oif };
2338         struct fib_result res;
2339         unsigned flags = 0;
2340         struct net_device *dev_out = NULL;
2341         int free_res = 0;
2342         int err;
2343
2344
2345         res.fi          = NULL;
2346 #ifdef CONFIG_IP_MULTIPLE_TABLES
2347         res.r           = NULL;
2348 #endif
2349
2350         if (oldflp->fl4_src) {
2351                 err = -EINVAL;
2352                 if (MULTICAST(oldflp->fl4_src) ||
2353                     BADCLASS(oldflp->fl4_src) ||
2354                     ZERONET(oldflp->fl4_src))
2355                         goto out;
2356
2357                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358                 dev_out = ip_dev_find(oldflp->fl4_src);
2359                 if (dev_out == NULL)
2360                         goto out;
2361
2362                 /* I removed the check for oif == dev_out->oif here.
2363                    It was wrong for two reasons:
2364                    1. ip_dev_find(saddr) can return the wrong iface, if saddr
2365                       is assigned to multiple interfaces.
2366                    2. Moreover, we are allowed to send packets with the saddr
2367                       of another iface. --ANK
2368                  */
2369
2370                 if (oldflp->oif == 0
2371                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2372                         /* Special hack: the user can direct multicasts
2373                            and limited broadcasts via the necessary interface
2374                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375                            This hack is not just for fun; it allows
2376                            vic, vat and friends to work.
2377                            They bind a socket to loopback, set the ttl to zero
2378                            and expect that it will work.
2379                            From the viewpoint of the routing cache they are broken,
2380                            because we are not allowed to build a multicast path
2381                            with a loopback source addr (look, the routing cache
2382                            cannot know that the ttl is zero, so the packet
2383                            will not leave this host and the route is valid).
2384                            Luckily, this hack is a good workaround.
2385                          */
2386
2387                         fl.oif = dev_out->ifindex;
2388                         goto make_route;
2389                 }
2390                 if (dev_out)
2391                         dev_put(dev_out);
2392                 dev_out = NULL;
2393         }
2394
2395
2396         if (oldflp->oif) {
2397                 dev_out = dev_get_by_index(oldflp->oif);
2398                 err = -ENODEV;
2399                 if (dev_out == NULL)
2400                         goto out;
2401                 if (__in_dev_get(dev_out) == NULL) {
2402                         dev_put(dev_out);
2403                         goto out;       /* Wrong error code */
2404                 }
2405
2406                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2407                         if (!fl.fl4_src)
2408                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2409                                                               RT_SCOPE_LINK);
2410                         goto make_route;
2411                 }
2412                 if (!fl.fl4_src) {
2413                         if (MULTICAST(oldflp->fl4_dst))
2414                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2415                                                               fl.fl4_scope);
2416                         else if (!oldflp->fl4_dst)
2417                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2418                                                               RT_SCOPE_HOST);
2419                 }
2420         }
2421
2422         if (!fl.fl4_dst) {
2423                 fl.fl4_dst = fl.fl4_src;
2424                 if (!fl.fl4_dst)
2425                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2426                 if (dev_out)
2427                         dev_put(dev_out);
2428                 dev_out = &loopback_dev;
2429                 dev_hold(dev_out);
2430                 fl.oif = loopback_dev.ifindex;
2431                 res.type = RTN_LOCAL;
2432                 flags |= RTCF_LOCAL;
2433                 goto make_route;
2434         }
2435
2436         if (fib_lookup(&fl, &res)) {
2437                 res.fi = NULL;
2438                 if (oldflp->oif) {
2439                         /* Apparently, the routing tables are wrong.
2440                            Assume that the destination is on-link.
2441
2442                            WHY? DW.
2443                            Because we are allowed to send to an iface
2444                            even if it has NO routes and NO assigned
2445                            addresses. When oif is specified, the routing
2446                            tables are looked up with only one purpose:
2447                            to detect whether the destination is gatewayed
2448                            rather than direct. Moreover, if MSG_DONTROUTE is
2449                            set, we send the packet, ignoring both routing
2450                            tables and ifaddr state. --ANK
2451
2452
2453                            We could do this even when oif is unknown,
2454                            as IPv6 likely does, but we do not.
2455                          */
2456
2457                         if (fl.fl4_src == 0)
2458                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2459                                                               RT_SCOPE_LINK);
2460                         res.type = RTN_UNICAST;
2461                         goto make_route;
2462                 }
2463                 if (dev_out)
2464                         dev_put(dev_out);
2465                 err = -ENETUNREACH;
2466                 goto out;
2467         }
2468         free_res = 1;
2469
2470         if (res.type == RTN_LOCAL) {
2471                 if (!fl.fl4_src)
2472                         fl.fl4_src = fl.fl4_dst;
2473                 if (dev_out)
2474                         dev_put(dev_out);
2475                 dev_out = &loopback_dev;
2476                 dev_hold(dev_out);
2477                 fl.oif = dev_out->ifindex;
2478                 if (res.fi)
2479                         fib_info_put(res.fi);
2480                 res.fi = NULL;
2481                 flags |= RTCF_LOCAL;
2482                 goto make_route;
2483         }
2484
2485 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2486         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2487                 fib_select_multipath(&fl, &res);
2488         else
2489 #endif
2490         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2491                 fib_select_default(&fl, &res);
2492
2493         if (!fl.fl4_src)
2494                 fl.fl4_src = FIB_RES_PREFSRC(res);
2495
2496         if (dev_out)
2497                 dev_put(dev_out);
2498         dev_out = FIB_RES_DEV(res);
2499         dev_hold(dev_out);
2500         fl.oif = dev_out->ifindex;
2501
2502
2503 make_route:
2504         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2505
2506
2507         if (free_res)
2508                 fib_res_put(&res);
2509         if (dev_out)
2510                 dev_put(dev_out);
2511 out:    return err;
2512 }
2513
2514 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2515 {
2516         unsigned hash;
2517         struct rtable *rth;
2518
2519         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2520
2521         rcu_read_lock_bh();
2522         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2523                 rth = rcu_dereference(rth->u.rt_next)) {
2524                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2525                     rth->fl.fl4_src == flp->fl4_src &&
2526                     rth->fl.iif == 0 &&
2527                     rth->fl.oif == flp->oif &&
2528 #ifdef CONFIG_IP_ROUTE_FWMARK
2529                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2530 #endif
2531                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2532                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2533
2534                         /* check for multipath routes and choose one if
2535                          * necessary
2536                          */
2537                         if (multipath_select_route(flp, rth, rp)) {
2538                                 dst_hold(&(*rp)->u.dst);
2539                                 RT_CACHE_STAT_INC(out_hit);
2540                                 rcu_read_unlock_bh();
2541                                 return 0;
2542                         }
2543
2544                         rth->u.dst.lastuse = jiffies;
2545                         dst_hold(&rth->u.dst);
2546                         rth->u.dst.__use++;
2547                         RT_CACHE_STAT_INC(out_hit);
2548                         rcu_read_unlock_bh();
2549                         *rp = rth;
2550                         return 0;
2551                 }
2552                 RT_CACHE_STAT_INC(out_hlist_search);
2553         }
2554         rcu_read_unlock_bh();
2555
2556         return ip_route_output_slow(rp, flp);
2557 }
2558
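/*
 * Like __ip_route_output_key(), but when the flow carries a protocol
 * the result is additionally passed through xfrm_lookup(), giving
 * IPsec a chance to replace the plain route with a transformed bundle.
 */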
2559 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2560 {
2561         int err;
2562
2563         if ((err = __ip_route_output_key(rp, flp)) != 0)
2564                 return err;
2565
2566         if (flp->proto) {
2567                 if (!flp->fl4_src)
2568                         flp->fl4_src = (*rp)->rt_src;
2569                 if (!flp->fl4_dst)
2570                         flp->fl4_dst = (*rp)->rt_dst;
2571                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2572         }
2573
2574         return 0;
2575 }
2576
2577 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2578 {
2579         return ip_route_output_flow(rp, flp, NULL, 0);
2580 }
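/*
 * Illustrative sketch (hypothetical caller; dip, sip and tos are
 * stand-ins): a typical output lookup builds a flow key and drops the
 * route reference when done, e.g.
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip,
 *						 .saddr = sip,
 *						 .tos = tos } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		... use rt->u.dst.dev, rt->rt_gateway ...
 *		ip_rt_put(rt);
 *	}
 */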
2581
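/*
 * Fill a netlink RTM_NEWROUTE message describing the cache entry
 * attached to skb->dst. Returns skb->len on success, 0 when a
 * multicast lookup defers the answer, and -1 when the message does
 * not fit in the supplied skb.
 */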
2582 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2583                         int nowait)
2584 {
2585         struct rtable *rt = (struct rtable*)skb->dst;
2586         struct rtmsg *r;
2587         struct nlmsghdr  *nlh;
2588         unsigned char    *b = skb->tail;
2589         struct rta_cacheinfo ci;
2590 #ifdef CONFIG_IP_MROUTE
2591         struct rtattr *eptr;
2592 #endif
2593         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2594         r = NLMSG_DATA(nlh);
2595         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2596         r->rtm_family    = AF_INET;
2597         r->rtm_dst_len  = 32;
2598         r->rtm_src_len  = 0;
2599         r->rtm_tos      = rt->fl.fl4_tos;
2600         r->rtm_table    = RT_TABLE_MAIN;
2601         r->rtm_type     = rt->rt_type;
2602         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2603         r->rtm_protocol = RTPROT_UNSPEC;
2604         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605         if (rt->rt_flags & RTCF_NOTIFY)
2606                 r->rtm_flags |= RTM_F_NOTIFY;
2607         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608         if (rt->fl.fl4_src) {
2609                 r->rtm_src_len = 32;
2610                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2611         }
2612         if (rt->u.dst.dev)
2613                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614 #ifdef CONFIG_NET_CLS_ROUTE
2615         if (rt->u.dst.tclassid)
2616                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617 #endif
2618 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620                 __u32 alg = rt->rt_multipath_alg;
2621
2622                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2623         }
2624 #endif
2625         if (rt->fl.iif)
2626                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627         else if (rt->rt_src != rt->fl.fl4_src)
2628                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629         if (rt->rt_dst != rt->rt_gateway)
2630                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632                 goto rtattr_failure;
2633         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634         ci.rta_used     = rt->u.dst.__use;
2635         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2636         if (rt->u.dst.expires)
2637                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638         else
2639                 ci.rta_expires = 0;
2640         ci.rta_error    = rt->u.dst.error;
2641         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2642         if (rt->peer) {
2643                 ci.rta_id = rt->peer->ip_id_count;
2644                 if (rt->peer->tcp_ts_stamp) {
2645                         ci.rta_ts = rt->peer->tcp_ts;
2646                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2647                 }
2648         }
2649 #ifdef CONFIG_IP_MROUTE
2650         eptr = (struct rtattr*)skb->tail;
2651 #endif
2652         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653         if (rt->fl.iif) {
2654 #ifdef CONFIG_IP_MROUTE
2655                 u32 dst = rt->rt_dst;
2656
2657                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658                     ipv4_devconf.mc_forwarding) {
2659                         int err = ipmr_get_route(skb, r, nowait);
2660                         if (err <= 0) {
2661                                 if (!nowait) {
2662                                         if (err == 0)
2663                                                 return 0;
2664                                         goto nlmsg_failure;
2665                                 } else {
2666                                         if (err == -EMSGSIZE)
2667                                                 goto nlmsg_failure;
2668                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2669                                 }
2670                         }
2671                 } else
2672 #endif
2673                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2674         }
2675
2676         nlh->nlmsg_len = skb->tail - b;
2677         return skb->len;
2678
2679 nlmsg_failure:
2680 rtattr_failure:
2681         skb_trim(skb, b - skb->data);
2682         return -1;
2683 }
2684
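/*
 * RTM_GETROUTE handler: resolve a single route exactly the way a real
 * packet would (the input path when RTA_IIF is supplied, the output
 * path otherwise) and unicast the result back as RTM_NEWROUTE. This is
 * what "ip route get" exercises from userspace.
 */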
2685 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2686 {
2687         struct rtattr **rta = arg;
2688         struct rtmsg *rtm = NLMSG_DATA(nlh);
2689         struct rtable *rt = NULL;
2690         u32 dst = 0;
2691         u32 src = 0;
2692         int iif = 0;
2693         int err = -ENOBUFS;
2694         struct sk_buff *skb;
2695
2696         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697         if (!skb)
2698                 goto out;
2699
2700         /* Reserve room for dummy headers; this skb can pass
2701            through a good chunk of the routing engine.
2702          */
2703         skb->mac.raw = skb->data;
2704         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2705
2706         if (rta[RTA_SRC - 1])
2707                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708         if (rta[RTA_DST - 1])
2709                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710         if (rta[RTA_IIF - 1])
2711                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2712
2713         if (iif) {
2714                 struct net_device *dev = __dev_get_by_index(iif);
2715                 err = -ENODEV;
2716                 if (!dev)
2717                         goto out_free;
2718                 skb->protocol   = htons(ETH_P_IP);
2719                 skb->dev        = dev;
2720                 local_bh_disable();
2721                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722                 local_bh_enable();
2723                 rt = (struct rtable*)skb->dst;
2724                 if (!err && rt->u.dst.error)
2725                         err = -rt->u.dst.error;
2726         } else {
2727                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728                                                          .saddr = src,
2729                                                          .tos = rtm->rtm_tos } } };
2730                 int oif = 0;
2731                 if (rta[RTA_OIF - 1])
2732                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733                 fl.oif = oif;
2734                 err = ip_route_output_key(&rt, &fl);
2735         }
2736         if (err)
2737                 goto out_free;
2738
2739         skb->dst = &rt->u.dst;
2740         if (rtm->rtm_flags & RTM_F_NOTIFY)
2741                 rt->rt_flags |= RTCF_NOTIFY;
2742
2743         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2744
2745         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746                                 RTM_NEWROUTE, 0);
2747         if (!err)
2748                 goto out_free;
2749         if (err < 0) {
2750                 err = -EMSGSIZE;
2751                 goto out_free;
2752         }
2753
2754         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755         if (err > 0)
2756                 err = 0;
2757 out:    return err;
2758
2759 out_free:
2760         kfree_skb(skb);
2761         goto out;
2762 }
2763
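/*
 * Netlink dump callback: walk every hash chain and emit one
 * RTM_NEWROUTE record per cached route, recording the (chain, index)
 * position in cb->args so that an interrupted dump can resume.
 */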
2764 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2765 {
2766         struct rtable *rt;
2767         int h, s_h;
2768         int idx, s_idx;
2769
2770         s_h = cb->args[0];
2771         s_idx = idx = cb->args[1];
2772         for (h = 0; h <= rt_hash_mask; h++) {
2773                 if (h < s_h) continue;
2774                 if (h > s_h)
2775                         s_idx = 0;
2776                 rcu_read_lock_bh();
2777                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2779                         if (idx < s_idx)
2780                                 continue;
2781                         skb->dst = dst_clone(&rt->u.dst);
2782                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783                                          cb->nlh->nlmsg_seq,
2784                                          RTM_NEWROUTE, 1) <= 0) {
2785                                 dst_release(xchg(&skb->dst, NULL));
2786                                 rcu_read_unlock_bh();
2787                                 goto done;
2788                         }
2789                         dst_release(xchg(&skb->dst, NULL));
2790                 }
2791                 rcu_read_unlock_bh();
2792         }
2793
2794 done:
2795         cb->args[0] = h;
2796         cb->args[1] = idx;
2797         return skb->len;
2798 }
2799
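/*
 * Multicast configuration on some device changed: flush the whole
 * cache so that stale multicast entries disappear immediately.
 */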
2800 void ip_rt_multicast_event(struct in_device *in_dev)
2801 {
2802         rt_cache_flush(0);
2803 }
2804
2805 #ifdef CONFIG_SYSCTL
2806 static int flush_delay;
2807
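/*
 * Illustrative usage: writing an integer to
 * /proc/sys/net/ipv4/route/flush flushes the routing cache after the
 * given delay, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * for an immediate flush. Reading the file is not supported and
 * returns -EINVAL.
 */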
2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809                                         struct file *filp, void __user *buffer,
2810                                         size_t *lenp, loff_t *ppos)
2811 {
2812         if (write) {
2813                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814                 rt_cache_flush(flush_delay);
2815                 return 0;
2816         } 
2817
2818         return -EINVAL;
2819 }
2820
2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822                                                 int __user *name,
2823                                                 int nlen,
2824                                                 void __user *oldval,
2825                                                 size_t __user *oldlenp,
2826                                                 void __user *newval,
2827                                                 size_t newlen,
2828                                                 void **context)
2829 {
2830         int delay;
2831         if (newlen != sizeof(int))
2832                 return -EINVAL;
2833         if (get_user(delay, (int __user *)newval))
2834                 return -EFAULT; 
2835         rt_cache_flush(delay); 
2836         return 0;
2837 }
2838
2839 ctl_table ipv4_route_table[] = {
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2842                 .procname       = "flush",
2843                 .data           = &flush_delay,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0644,
2846                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2847                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2848         },
2849         {
2850                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2851                 .procname       = "min_delay",
2852                 .data           = &ip_rt_min_delay,
2853                 .maxlen         = sizeof(int),
2854                 .mode           = 0644,
2855                 .proc_handler   = &proc_dointvec_jiffies,
2856                 .strategy       = &sysctl_jiffies,
2857         },
2858         {
2859                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2860                 .procname       = "max_delay",
2861                 .data           = &ip_rt_max_delay,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = &proc_dointvec_jiffies,
2865                 .strategy       = &sysctl_jiffies,
2866         },
2867         {
2868                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2869                 .procname       = "gc_thresh",
2870                 .data           = &ipv4_dst_ops.gc_thresh,
2871                 .maxlen         = sizeof(int),
2872                 .mode           = 0644,
2873                 .proc_handler   = &proc_dointvec,
2874         },
2875         {
2876                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2877                 .procname       = "max_size",
2878                 .data           = &ip_rt_max_size,
2879                 .maxlen         = sizeof(int),
2880                 .mode           = 0644,
2881                 .proc_handler   = &proc_dointvec,
2882         },
2883         {
2884                 /* Deprecated; use gc_min_interval_ms instead. */
2885
2886                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887                 .procname       = "gc_min_interval",
2888                 .data           = &ip_rt_gc_min_interval,
2889                 .maxlen         = sizeof(int),
2890                 .mode           = 0644,
2891                 .proc_handler   = &proc_dointvec_jiffies,
2892                 .strategy       = &sysctl_jiffies,
2893         },
2894         {
2895                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896                 .procname       = "gc_min_interval_ms",
2897                 .data           = &ip_rt_gc_min_interval,
2898                 .maxlen         = sizeof(int),
2899                 .mode           = 0644,
2900                 .proc_handler   = &proc_dointvec_ms_jiffies,
2901                 .strategy       = &sysctl_ms_jiffies,
2902         },
2903         {
2904                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2905                 .procname       = "gc_timeout",
2906                 .data           = &ip_rt_gc_timeout,
2907                 .maxlen         = sizeof(int),
2908                 .mode           = 0644,
2909                 .proc_handler   = &proc_dointvec_jiffies,
2910                 .strategy       = &sysctl_jiffies,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2914                 .procname       = "gc_interval",
2915                 .data           = &ip_rt_gc_interval,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec_jiffies,
2919                 .strategy       = &sysctl_jiffies,
2920         },
2921         {
2922                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2923                 .procname       = "redirect_load",
2924                 .data           = &ip_rt_redirect_load,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = &proc_dointvec,
2928         },
2929         {
2930                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931                 .procname       = "redirect_number",
2932                 .data           = &ip_rt_redirect_number,
2933                 .maxlen         = sizeof(int),
2934                 .mode           = 0644,
2935                 .proc_handler   = &proc_dointvec,
2936         },
2937         {
2938                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939                 .procname       = "redirect_silence",
2940                 .data           = &ip_rt_redirect_silence,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec,
2944         },
2945         {
2946                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2947                 .procname       = "error_cost",
2948                 .data           = &ip_rt_error_cost,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = &proc_dointvec,
2952         },
2953         {
2954                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2955                 .procname       = "error_burst",
2956                 .data           = &ip_rt_error_burst,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = &proc_dointvec,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2963                 .procname       = "gc_elasticity",
2964                 .data           = &ip_rt_gc_elasticity,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2971                 .procname       = "mtu_expires",
2972                 .data           = &ip_rt_mtu_expires,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec_jiffies,
2976                 .strategy       = &sysctl_jiffies,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2980                 .procname       = "min_pmtu",
2981                 .data           = &ip_rt_min_pmtu,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2988                 .procname       = "min_adv_mss",
2989                 .data           = &ip_rt_min_advmss,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2996                 .procname       = "secret_interval",
2997                 .data           = &ip_rt_secret_interval,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec_jiffies,
3001                 .strategy       = &sysctl_jiffies,
3002         },
3003         { .ctl_name = 0 }
3004 };
3005 #endif
3006
3007 #ifdef CONFIG_NET_CLS_ROUTE
3008 struct ip_rt_acct *ip_rt_acct;
3009
3010 /* This code sucks.  But you should have seen it before! --RR */
3011
3012 /* IP route accounting ptr for this logical cpu number. */
3013 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
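/*
 * Each possible cpu owns a private array of 256 counters, so the hot
 * path can bump them without any locking; readers such as
 * ip_rt_acct_read() below sum the arrays across cpus.
 */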
3014
3015 #ifdef CONFIG_PROC_FS
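/*
 * Read handler for /proc/net/rt_acct: sums the per-cpu accounting
 * arrays into the caller's buffer.  offset and length must be 4-byte
 * aligned because the summation works one u32 at a time.
 */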
3016 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017                            int length, int *eof, void *data)
3018 {
3019         unsigned int i;
3020
3021         if ((offset & 3) || (length & 3))
3022                 return -EIO;
3023
3024         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025                 *eof = 1;
3026                 return 0;
3027         }
3028
3029         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3031                 *eof = 1;
3032         }
3033
3034         offset /= sizeof(u32);
3035
3036         if (length > 0) {
3037                 u32 *src;
3038                 u32 *dst = (u32 *) buffer;
3039
3040                 /* Zero the buffer: every cpu, including cpu 0, is summed in below. */
3041                 *start = buffer;
3042                 memset(dst, 0, length);
3043
3044                 /* Add each cpu in, one int at a time. */
3045                 for_each_cpu(i) {
3046                         unsigned int j;
3047
3048                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3049
3050                         for (j = 0; j < length/4; j++)
3051                                 dst[j] += src[j];
3052                 }
3053         }
3054         return length;
3055 }
3056 #endif /* CONFIG_PROC_FS */
3057 #endif /* CONFIG_NET_CLS_ROUTE */
3058
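/*
 * "rhash_entries=N" on the kernel command line overrides the
 * memory-based sizing of the route cache hash table computed in
 * ip_rt_init() below, e.g. booting with rhash_entries=2048
 * (illustrative value).
 */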
3059 static __initdata unsigned long rhash_entries;
3060 static int __init set_rhash_entries(char *str)
3061 {
3062         if (!str)
3063                 return 0;
3064         rhash_entries = simple_strtoul(str, &str, 0);
3065         return 1;
3066 }
3067 __setup("rhash_entries=", set_rhash_entries);
3068
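/*
 * Boot-time initialization: seed the hash secret, allocate the classid
 * accounting area and the dst slab cache, size the hash table (by
 * default about one page of buckets per 64MB of memory, i.e.
 * num_physpages >> (26 - PAGE_SHIFT) pages), register the proc files
 * and start the flush, expiry and secret-rebuild timers.
 */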
3069 int __init ip_rt_init(void)
3070 {
3071         int i, order, goal, rc = 0;
3072
3073         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074                              (jiffies ^ (jiffies >> 7)));
3075
3076 #ifdef CONFIG_NET_CLS_ROUTE
3077         for (order = 0;
3078              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079                 /* NOTHING */;
3080         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081         if (!ip_rt_acct)
3082                 panic("IP: failed to allocate ip_rt_acct\n");
3083         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084 #endif
3085
3086         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087                                                      sizeof(struct rtable),
3088                                                      0, SLAB_HWCACHE_ALIGN,
3089                                                      NULL, NULL);
3090
3091         if (!ipv4_dst_ops.kmem_cachep)
3092                 panic("IP: failed to allocate ip_dst_cache\n");
3093
3094         goal = num_physpages >> (26 - PAGE_SHIFT);
3095         if (rhash_entries)
3096                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097         for (order = 0; (1UL << order) < goal; order++)
3098                 /* NOTHING */;
3099
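        /* Round the bucket count down to a power of two so a simple
           mask can index the table; if the allocation fails, retry
           with ever smaller orders before giving up. */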
3100         do {
3101                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102                         sizeof(struct rt_hash_bucket);
3103                 while (rt_hash_mask & (rt_hash_mask - 1))
3104                         rt_hash_mask--;
3105                 rt_hash_table = (struct rt_hash_bucket *)
3106                         __get_free_pages(GFP_ATOMIC, order);
3107         } while (rt_hash_table == NULL && --order > 0);
3108
3109         if (!rt_hash_table)
3110                 panic("Failed to allocate IP route cache hash table\n");
3111
3112         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113                rt_hash_mask,
3114                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117                 /* NOTHING */;
3118
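        /* rt_hash_mask still holds the bucket count (a power of two);
           decrementing turns it into the index mask 2^rt_hash_log - 1. */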
3119         rt_hash_mask--;
3120         for (i = 0; i <= rt_hash_mask; i++) {
3121                 spin_lock_init(&rt_hash_table[i].lock);
3122                 rt_hash_table[i].chain = NULL;
3123         }
3124
3125         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3127
3128         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129         if (!rt_cache_stat)
3130                 return -ENOMEM;
3131
3132         devinet_init();
3133         ip_fib_init();
3134
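        /* Three deferred jobs: rt_flush_timer empties the cache on
           demand, rt_periodic_timer expires stale entries, and
           rt_secret_timer re-randomizes the hash secret (and flushes)
           to blunt hash-collision attacks. */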
3135         init_timer(&rt_flush_timer);
3136         rt_flush_timer.function = rt_run_flush;
3137         init_timer(&rt_periodic_timer);
3138         rt_periodic_timer.function = rt_check_expire;
3139         init_timer(&rt_secret_timer);
3140         rt_secret_timer.function = rt_secret_rebuild;
3141
3142         /* All the timers started at system startup tend to
3143            synchronize, so perturb them a bit.
3144          */
3145         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146                                         ip_rt_gc_interval;
3147         add_timer(&rt_periodic_timer);
3148
3149         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150                 ip_rt_secret_interval;
3151         add_timer(&rt_secret_timer);
3152
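        /* Two proc files share the name "rt_cache": /proc/net/rt_cache
           dumps the cache itself, /proc/net/stat/rt_cache exposes the
           per-cpu statistics. */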
3153 #ifdef CONFIG_PROC_FS
3154         {
3155         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3158                                              proc_net_stat))) {
3159                 free_percpu(rt_cache_stat);
3160                 return -ENOMEM;
3161         }
3162         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3163         }
3164 #ifdef CONFIG_NET_CLS_ROUTE
3165         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166 #endif
3167 #endif
3168 #ifdef CONFIG_XFRM
3169         xfrm_init();
3170         xfrm4_init();
3171 #endif
3172         return rc;
3173 }
3174
3175 EXPORT_SYMBOL(__ip_select_ident);
3176 EXPORT_SYMBOL(ip_route_input);
3177 EXPORT_SYMBOL(ip_route_output_key);