1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #define RT_FL_TOS(oldflp4) \
118 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131
132 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150 WARN_ON(1);
151 return NULL;
152 }
153
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155 struct sk_buff *skb,
156 const void *daddr);
157
158 static struct dst_ops ipv4_dst_ops = {
159 .family = AF_INET,
160 .check = ipv4_dst_check,
161 .default_advmss = ipv4_default_advmss,
162 .mtu = ipv4_mtu,
163 .cow_metrics = ipv4_cow_metrics,
164 .destroy = ipv4_dst_destroy,
165 .negative_advice = ipv4_negative_advice,
166 .link_failure = ipv4_link_failure,
167 .update_pmtu = ip_rt_update_pmtu,
168 .redirect = ip_do_redirect,
169 .local_out = __ip_local_out,
170 .neigh_lookup = ipv4_neigh_lookup,
171 };
172
173 #define ECN_OR_COST(class) TC_PRIO_##class
174
175 const __u8 ip_tos2prio[16] = {
176 TC_PRIO_BESTEFFORT,
177 ECN_OR_COST(BESTEFFORT),
178 TC_PRIO_BESTEFFORT,
179 ECN_OR_COST(BESTEFFORT),
180 TC_PRIO_BULK,
181 ECN_OR_COST(BULK),
182 TC_PRIO_BULK,
183 ECN_OR_COST(BULK),
184 TC_PRIO_INTERACTIVE,
185 ECN_OR_COST(INTERACTIVE),
186 TC_PRIO_INTERACTIVE,
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE_BULK,
189 ECN_OR_COST(INTERACTIVE_BULK),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK)
192 };
193 EXPORT_SYMBOL(ip_tos2prio);
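/* Usage sketch (illustration only): callers do not index this table with the
 * raw TOS byte.  The small helper rt_tos2priority() in include/net/route.h
 * masks and shifts it first, roughly:
 *
 *	skb->priority = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so e.g. IPTOS_LOWDELAY (0x10) lands on index 8, i.e. TC_PRIO_INTERACTIVE.
 */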
194
195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
197
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
200 {
201 if (*pos)
202 return NULL;
203 return SEQ_START_TOKEN;
204 }
205
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
207 {
208 ++*pos;
209 return NULL;
210 }
211
212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
213 {
214 }
215
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
217 {
218 if (v == SEQ_START_TOKEN)
219 seq_printf(seq, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222 "HHUptod\tSpecDst");
223 return 0;
224 }
225
226 static const struct seq_operations rt_cache_seq_ops = {
227 .start = rt_cache_seq_start,
228 .next = rt_cache_seq_next,
229 .stop = rt_cache_seq_stop,
230 .show = rt_cache_seq_show,
231 };
232
233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
234 {
235 return seq_open(file, &rt_cache_seq_ops);
236 }
237
238 static const struct file_operations rt_cache_seq_fops = {
239 .owner = THIS_MODULE,
240 .open = rt_cache_seq_open,
241 .read = seq_read,
242 .llseek = seq_lseek,
243 .release = seq_release,
244 };
245
246
247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
248 {
249 int cpu;
250
251 if (*pos == 0)
252 return SEQ_START_TOKEN;
253
254 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
255 if (!cpu_possible(cpu))
256 continue;
257 *pos = cpu+1;
258 return &per_cpu(rt_cache_stat, cpu);
259 }
260 return NULL;
261 }
262
263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264 {
265 int cpu;
266
267 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
268 if (!cpu_possible(cpu))
269 continue;
270 *pos = cpu+1;
271 return &per_cpu(rt_cache_stat, cpu);
272 }
273 return NULL;
274
275 }
276
277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
278 {
279
280 }
281
282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
283 {
284 struct rt_cache_stat *st = v;
285
286 if (v == SEQ_START_TOKEN) {
287 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288 return 0;
289 }
290
291 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
292 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
293 dst_entries_get_slow(&ipv4_dst_ops),
294 0, /* st->in_hit */
295 st->in_slow_tot,
296 st->in_slow_mc,
297 st->in_no_route,
298 st->in_brd,
299 st->in_martian_dst,
300 st->in_martian_src,
301
302 0, /* st->out_hit */
303 st->out_slow_tot,
304 st->out_slow_mc,
305
306 0, /* st->gc_total */
307 0, /* st->gc_ignored */
308 0, /* st->gc_goal_miss */
309 0, /* st->gc_dst_overflow */
310 0, /* st->in_hlist_search */
311 0 /* st->out_hlist_search */
312 );
313 return 0;
314 }
315
316 static const struct seq_operations rt_cpu_seq_ops = {
317 .start = rt_cpu_seq_start,
318 .next = rt_cpu_seq_next,
319 .stop = rt_cpu_seq_stop,
320 .show = rt_cpu_seq_show,
321 };
322
323
324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
325 {
326 return seq_open(file, &rt_cpu_seq_ops);
327 }
328
329 static const struct file_operations rt_cpu_seq_fops = {
330 .owner = THIS_MODULE,
331 .open = rt_cpu_seq_open,
332 .read = seq_read,
333 .llseek = seq_lseek,
334 .release = seq_release,
335 };
336
337 #ifdef CONFIG_IP_ROUTE_CLASSID
338 static int rt_acct_proc_show(struct seq_file *m, void *v)
339 {
340 struct ip_rt_acct *dst, *src;
341 unsigned int i, j;
342
343 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
344 if (!dst)
345 return -ENOMEM;
346
347 for_each_possible_cpu(i) {
348 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
349 for (j = 0; j < 256; j++) {
350 dst[j].o_bytes += src[j].o_bytes;
351 dst[j].o_packets += src[j].o_packets;
352 dst[j].i_bytes += src[j].i_bytes;
353 dst[j].i_packets += src[j].i_packets;
354 }
355 }
356
357 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
358 kfree(dst);
359 return 0;
360 }
361
362 static int rt_acct_proc_open(struct inode *inode, struct file *file)
363 {
364 return single_open(file, rt_acct_proc_show, NULL);
365 }
366
367 static const struct file_operations rt_acct_proc_fops = {
368 .owner = THIS_MODULE,
369 .open = rt_acct_proc_open,
370 .read = seq_read,
371 .llseek = seq_lseek,
372 .release = single_release,
373 };
374 #endif
375
376 static int __net_init ip_rt_do_proc_init(struct net *net)
377 {
378 struct proc_dir_entry *pde;
379
380 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
381 &rt_cache_seq_fops);
382 if (!pde)
383 goto err1;
384
385 pde = proc_create("rt_cache", S_IRUGO,
386 net->proc_net_stat, &rt_cpu_seq_fops);
387 if (!pde)
388 goto err2;
389
390 #ifdef CONFIG_IP_ROUTE_CLASSID
391 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
392 if (!pde)
393 goto err3;
394 #endif
395 return 0;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 err3:
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 #endif
401 err2:
402 remove_proc_entry("rt_cache", net->proc_net);
403 err1:
404 return -ENOMEM;
405 }
406
407 static void __net_exit ip_rt_do_proc_exit(struct net *net)
408 {
409 remove_proc_entry("rt_cache", net->proc_net_stat);
410 remove_proc_entry("rt_cache", net->proc_net);
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412 remove_proc_entry("rt_acct", net->proc_net);
413 #endif
414 }
415
416 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
417 .init = ip_rt_do_proc_init,
418 .exit = ip_rt_do_proc_exit,
419 };
420
421 static int __init ip_rt_proc_init(void)
422 {
423 return register_pernet_subsys(&ip_rt_proc_ops);
424 }
425
426 #else
427 static inline int ip_rt_proc_init(void)
428 {
429 return 0;
430 }
431 #endif /* CONFIG_PROC_FS */
432
433 static inline bool rt_is_expired(const struct rtable *rth)
434 {
435 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
436 }
437
438 void rt_cache_flush(struct net *net)
439 {
440 rt_genid_bump_ipv4(net);
441 }
442
443 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
444 struct sk_buff *skb,
445 const void *daddr)
446 {
447 struct net_device *dev = dst->dev;
448 const __be32 *pkey = daddr;
449 const struct rtable *rt;
450 struct neighbour *n;
451
452 rt = (const struct rtable *) dst;
453 if (rt->rt_gateway)
454 pkey = (const __be32 *) &rt->rt_gateway;
455 else if (skb)
456 pkey = &ip_hdr(skb)->daddr;
457
458 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
459 if (n)
460 return n;
461 return neigh_create(&arp_tbl, pkey, dev);
462 }
463
464 #define IP_IDENTS_SZ 2048u
465
466 static atomic_t *ip_idents __read_mostly;
467 static u32 *ip_tstamps __read_mostly;
468
469 /* In order to protect privacy, we add a perturbation to identifiers
 470  * if one generator is seldom used. This makes it hard for an attacker
471 * to infer how many packets were sent between two points in time.
472 */
473 u32 ip_idents_reserve(u32 hash, int segs)
474 {
475 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
476 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
477 u32 old = ACCESS_ONCE(*p_tstamp);
478 u32 now = (u32)jiffies;
479 u32 delta = 0;
480
481 if (old != now && cmpxchg(p_tstamp, old, now) == old)
482 delta = prandom_u32_max(now - old);
483
484 return atomic_add_return(segs + delta, p_id) - segs;
485 }
486 EXPORT_SYMBOL(ip_idents_reserve);
487
488 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
489 {
490 static u32 ip_idents_hashrnd __read_mostly;
491 u32 hash, id;
492
493 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
494
495 hash = jhash_3words((__force u32)iph->daddr,
496 (__force u32)iph->saddr,
497 iph->protocol ^ net_hash_mix(net),
498 ip_idents_hashrnd);
499 id = ip_idents_reserve(hash, segs);
500 iph->id = htons(id);
501 }
502 EXPORT_SYMBOL(__ip_select_ident);
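/* Illustrative sketch of the intended use (not part of the original file):
 * a transmit path emitting a GSO burst reserves one identifier per segment
 * in a single atomic step, roughly:
 *
 *	id = ip_idents_reserve(hash, segs);	/. first id of the burst ./
 *	iph->id = htons(id);			/. segments use id..id+segs-1 ./
 *
 * The random delta added for an idle bucket only moves the counter forward,
 * so identifiers are never reused within a burst, while the gap hides how
 * many packets were really sent between two observations.
 */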
503
504 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
505 const struct sock *sk,
506 const struct iphdr *iph,
507 int oif, u8 tos,
508 u8 prot, u32 mark, int flow_flags)
509 {
510 if (sk) {
511 const struct inet_sock *inet = inet_sk(sk);
512
513 oif = sk->sk_bound_dev_if;
514 mark = sk->sk_mark;
515 tos = RT_CONN_FLAGS(sk);
516 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
517 }
518 flowi4_init_output(fl4, oif, mark, tos,
519 RT_SCOPE_UNIVERSE, prot,
520 flow_flags,
521 iph->daddr, iph->saddr, 0, 0,
522 sock_net_uid(net, sk));
523 }
524
525 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
526 const struct sock *sk)
527 {
528 const struct net *net = dev_net(skb->dev);
529 const struct iphdr *iph = ip_hdr(skb);
530 int oif = skb->dev->ifindex;
531 u8 tos = RT_TOS(iph->tos);
532 u8 prot = iph->protocol;
533 u32 mark = skb->mark;
534
535 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540 const struct inet_sock *inet = inet_sk(sk);
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543
544 rcu_read_lock();
545 inet_opt = rcu_dereference(inet->inet_opt);
546 if (inet_opt && inet_opt->opt.srr)
547 daddr = inet_opt->opt.faddr;
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 inet_sk_flowi_flags(sk),
552 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
553 rcu_read_unlock();
554 }
555
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557 const struct sk_buff *skb)
558 {
559 if (skb)
560 build_skb_flow_key(fl4, skb, sk);
561 else
562 build_sk_flow_key(fl4, sk);
563 }
564
565 static inline void rt_free(struct rtable *rt)
566 {
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569
570 static DEFINE_SPINLOCK(fnhe_lock);
571
572 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
573 {
574 struct rtable *rt;
575
576 rt = rcu_dereference(fnhe->fnhe_rth_input);
577 if (rt) {
578 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
579 rt_free(rt);
580 }
581 rt = rcu_dereference(fnhe->fnhe_rth_output);
582 if (rt) {
583 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
584 rt_free(rt);
585 }
586 }
587
588 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
589 {
590 struct fib_nh_exception *fnhe, *oldest;
591
592 oldest = rcu_dereference(hash->chain);
593 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
594 fnhe = rcu_dereference(fnhe->fnhe_next)) {
595 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
596 oldest = fnhe;
597 }
598 fnhe_flush_routes(oldest);
599 return oldest;
600 }
601
602 static inline u32 fnhe_hashfun(__be32 daddr)
603 {
604 static u32 fnhe_hashrnd __read_mostly;
605 u32 hval;
606
607 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
608 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
609 return hash_32(hval, FNHE_HASH_SHIFT);
610 }
611
612 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
613 {
614 rt->rt_pmtu = fnhe->fnhe_pmtu;
615 rt->dst.expires = fnhe->fnhe_expires;
616
617 if (fnhe->fnhe_gw) {
618 rt->rt_flags |= RTCF_REDIRECTED;
619 rt->rt_gateway = fnhe->fnhe_gw;
620 rt->rt_uses_gateway = 1;
621 }
622 }
623
624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
625 u32 pmtu, unsigned long expires)
626 {
627 struct fnhe_hash_bucket *hash;
628 struct fib_nh_exception *fnhe;
629 struct rtable *rt;
630 unsigned int i;
631 int depth;
632 u32 hval = fnhe_hashfun(daddr);
633
634 spin_lock_bh(&fnhe_lock);
635
636 hash = rcu_dereference(nh->nh_exceptions);
637 if (!hash) {
638 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
639 if (!hash)
640 goto out_unlock;
641 rcu_assign_pointer(nh->nh_exceptions, hash);
642 }
643
644 hash += hval;
645
646 depth = 0;
647 for (fnhe = rcu_dereference(hash->chain); fnhe;
648 fnhe = rcu_dereference(fnhe->fnhe_next)) {
649 if (fnhe->fnhe_daddr == daddr)
650 break;
651 depth++;
652 }
653
654 if (fnhe) {
655 if (gw)
656 fnhe->fnhe_gw = gw;
657 if (pmtu) {
658 fnhe->fnhe_pmtu = pmtu;
659 fnhe->fnhe_expires = max(1UL, expires);
660 }
661 /* Update all cached dsts too */
662 rt = rcu_dereference(fnhe->fnhe_rth_input);
663 if (rt)
664 fill_route_from_fnhe(rt, fnhe);
665 rt = rcu_dereference(fnhe->fnhe_rth_output);
666 if (rt)
667 fill_route_from_fnhe(rt, fnhe);
668 } else {
669 if (depth > FNHE_RECLAIM_DEPTH)
670 fnhe = fnhe_oldest(hash);
671 else {
672 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
673 if (!fnhe)
674 goto out_unlock;
675
676 fnhe->fnhe_next = hash->chain;
677 rcu_assign_pointer(hash->chain, fnhe);
678 }
679 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
680 fnhe->fnhe_daddr = daddr;
681 fnhe->fnhe_gw = gw;
682 fnhe->fnhe_pmtu = pmtu;
683 fnhe->fnhe_expires = expires;
684
685 /* Exception created; mark the cached routes for the nexthop
 686 		 * stale, so anyone caching them rechecks whether this exception
 687 		 * still applies.
688 */
689 rt = rcu_dereference(nh->nh_rth_input);
690 if (rt)
691 rt->dst.obsolete = DST_OBSOLETE_KILL;
692
693 for_each_possible_cpu(i) {
694 struct rtable __rcu **prt;
695 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
696 rt = rcu_dereference(*prt);
697 if (rt)
698 rt->dst.obsolete = DST_OBSOLETE_KILL;
699 }
700 }
701
702 fnhe->fnhe_stamp = jiffies;
703
704 out_unlock:
705 spin_unlock_bh(&fnhe_lock);
706 }
707
708 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
709 bool kill_route)
710 {
711 __be32 new_gw = icmp_hdr(skb)->un.gateway;
712 __be32 old_gw = ip_hdr(skb)->saddr;
713 struct net_device *dev = skb->dev;
714 struct in_device *in_dev;
715 struct fib_result res;
716 struct neighbour *n;
717 struct net *net;
718
719 switch (icmp_hdr(skb)->code & 7) {
720 case ICMP_REDIR_NET:
721 case ICMP_REDIR_NETTOS:
722 case ICMP_REDIR_HOST:
723 case ICMP_REDIR_HOSTTOS:
724 break;
725
726 default:
727 return;
728 }
729
730 if (rt->rt_gateway != old_gw)
731 return;
732
733 in_dev = __in_dev_get_rcu(dev);
734 if (!in_dev)
735 return;
736
737 net = dev_net(dev);
738 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
739 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
740 ipv4_is_zeronet(new_gw))
741 goto reject_redirect;
742
743 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
744 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
745 goto reject_redirect;
746 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
747 goto reject_redirect;
748 } else {
749 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
750 goto reject_redirect;
751 }
752
753 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
754 if (!n)
755 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
756 if (!IS_ERR(n)) {
757 if (!(n->nud_state & NUD_VALID)) {
758 neigh_event_send(n, NULL);
759 } else {
760 if (fib_lookup(net, fl4, &res, 0) == 0) {
761 struct fib_nh *nh = &FIB_RES_NH(res);
762
763 update_or_create_fnhe(nh, fl4->daddr, new_gw,
764 0, jiffies + ip_rt_gc_timeout);
765 }
766 if (kill_route)
767 rt->dst.obsolete = DST_OBSOLETE_KILL;
768 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
769 }
770 neigh_release(n);
771 }
772 return;
773
774 reject_redirect:
775 #ifdef CONFIG_IP_ROUTE_VERBOSE
776 if (IN_DEV_LOG_MARTIANS(in_dev)) {
777 const struct iphdr *iph = (const struct iphdr *) skb->data;
778 __be32 daddr = iph->daddr;
779 __be32 saddr = iph->saddr;
780
781 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
782 " Advised path = %pI4 -> %pI4\n",
783 &old_gw, dev->name, &new_gw,
784 &saddr, &daddr);
785 }
786 #endif
787 ;
788 }
789
790 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
791 {
792 struct rtable *rt;
793 struct flowi4 fl4;
794 const struct iphdr *iph = (const struct iphdr *) skb->data;
795 struct net *net = dev_net(skb->dev);
796 int oif = skb->dev->ifindex;
797 u8 tos = RT_TOS(iph->tos);
798 u8 prot = iph->protocol;
799 u32 mark = skb->mark;
800
801 rt = (struct rtable *) dst;
802
803 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
804 __ip_do_redirect(rt, skb, &fl4, true);
805 }
806
807 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
808 {
809 struct rtable *rt = (struct rtable *)dst;
810 struct dst_entry *ret = dst;
811
812 if (rt) {
813 if (dst->obsolete > 0) {
814 ip_rt_put(rt);
815 ret = NULL;
816 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
817 rt->dst.expires) {
818 ip_rt_put(rt);
819 ret = NULL;
820 }
821 }
822 return ret;
823 }
824
825 /*
826 * Algorithm:
827 * 1. The first ip_rt_redirect_number redirects are sent
828 * with exponential backoff, then we stop sending them at all,
829 * assuming that the host ignores our redirects.
830 * 2. If we did not see packets requiring redirects
831 * during ip_rt_redirect_silence, we assume that the host
 832  *	forgot the redirected route and start to send redirects again.
833 *
834 * This algorithm is much cheaper and more intelligent than dumb load limiting
835 * in icmp.c.
836 *
837 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
838 * and "frag. need" (breaks PMTU discovery) in icmp.c.
839 */
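/* Worked example with the defaults defined above (illustration only):
 * ip_rt_redirect_load = HZ/50 and ip_rt_redirect_number = 9, so the first
 * redirect goes out immediately and each further one only after
 *
 *	rate_last + (HZ/50) << rate_tokens
 *
 * i.e. the gaps roughly double from ~40ms up to ~5s.  After nine unanswered
 * redirects we go silent until the peer has triggered none for
 * ip_rt_redirect_silence = (HZ/50) << 10, about 20 seconds.
 */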
840
841 void ip_rt_send_redirect(struct sk_buff *skb)
842 {
843 struct rtable *rt = skb_rtable(skb);
844 struct in_device *in_dev;
845 struct inet_peer *peer;
846 struct net *net;
847 int log_martians;
848 int vif;
849
850 rcu_read_lock();
851 in_dev = __in_dev_get_rcu(rt->dst.dev);
852 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
853 rcu_read_unlock();
854 return;
855 }
856 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
857 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
858 rcu_read_unlock();
859
860 net = dev_net(rt->dst.dev);
861 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
862 if (!peer) {
863 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
864 rt_nexthop(rt, ip_hdr(skb)->daddr));
865 return;
866 }
867
868 /* No redirected packets during ip_rt_redirect_silence;
869 * reset the algorithm.
870 */
871 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
872 peer->rate_tokens = 0;
873
 874 	/* Too many ignored redirects; do not send anything, just
 875 	 * set dst.rate_last to the last seen redirected packet.
876 */
877 if (peer->rate_tokens >= ip_rt_redirect_number) {
878 peer->rate_last = jiffies;
879 goto out_put_peer;
880 }
881
882 /* Check for load limit; set rate_last to the latest sent
883 * redirect.
884 */
885 if (peer->rate_tokens == 0 ||
886 time_after(jiffies,
887 (peer->rate_last +
888 (ip_rt_redirect_load << peer->rate_tokens)))) {
889 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
890
891 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
892 peer->rate_last = jiffies;
893 ++peer->rate_tokens;
894 #ifdef CONFIG_IP_ROUTE_VERBOSE
895 if (log_martians &&
896 peer->rate_tokens == ip_rt_redirect_number)
897 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
898 &ip_hdr(skb)->saddr, inet_iif(skb),
899 &ip_hdr(skb)->daddr, &gw);
900 #endif
901 }
902 out_put_peer:
903 inet_putpeer(peer);
904 }
905
906 static int ip_error(struct sk_buff *skb)
907 {
908 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
909 struct rtable *rt = skb_rtable(skb);
910 struct inet_peer *peer;
911 unsigned long now;
912 struct net *net;
913 bool send;
914 int code;
915
916 /* IP on this device is disabled. */
917 if (!in_dev)
918 goto out;
919
920 net = dev_net(rt->dst.dev);
921 if (!IN_DEV_FORWARD(in_dev)) {
922 switch (rt->dst.error) {
923 case EHOSTUNREACH:
924 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
925 break;
926
927 case ENETUNREACH:
928 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
929 break;
930 }
931 goto out;
932 }
933
934 switch (rt->dst.error) {
935 case EINVAL:
936 default:
937 goto out;
938 case EHOSTUNREACH:
939 code = ICMP_HOST_UNREACH;
940 break;
941 case ENETUNREACH:
942 code = ICMP_NET_UNREACH;
943 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
944 break;
945 case EACCES:
946 code = ICMP_PKT_FILTERED;
947 break;
948 }
949
950 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
951 l3mdev_master_ifindex(skb->dev), 1);
952
953 send = true;
954 if (peer) {
955 now = jiffies;
956 peer->rate_tokens += now - peer->rate_last;
957 if (peer->rate_tokens > ip_rt_error_burst)
958 peer->rate_tokens = ip_rt_error_burst;
959 peer->rate_last = now;
960 if (peer->rate_tokens >= ip_rt_error_cost)
961 peer->rate_tokens -= ip_rt_error_cost;
962 else
963 send = false;
964 inet_putpeer(peer);
965 }
966 if (send)
967 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
968
969 out: kfree_skb(skb);
970 return 0;
971 }
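/* Note on the numbers above (illustration only): peer->rate_tokens behaves
 * like a token bucket refilled at one token per jiffy and capped at
 * ip_rt_error_burst (5 * HZ); each ICMP error sent costs ip_rt_error_cost
 * (HZ) tokens.  A single peer can therefore trigger a burst of about five
 * errors and is then limited to roughly one per second.
 */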
972
973 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
974 {
975 struct dst_entry *dst = &rt->dst;
976 struct fib_result res;
977
978 if (dst_metric_locked(dst, RTAX_MTU))
979 return;
980
981 if (ipv4_mtu(dst) < mtu)
982 return;
983
984 if (mtu < ip_rt_min_pmtu)
985 mtu = ip_rt_min_pmtu;
986
987 if (rt->rt_pmtu == mtu &&
988 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
989 return;
990
991 rcu_read_lock();
992 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
993 struct fib_nh *nh = &FIB_RES_NH(res);
994
995 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
996 jiffies + ip_rt_mtu_expires);
997 }
998 rcu_read_unlock();
999 }
1000
1001 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1002 struct sk_buff *skb, u32 mtu)
1003 {
1004 struct rtable *rt = (struct rtable *) dst;
1005 struct flowi4 fl4;
1006
1007 ip_rt_build_flow_key(&fl4, sk, skb);
1008 __ip_rt_update_pmtu(rt, &fl4, mtu);
1009 }
1010
1011 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1012 int oif, u32 mark, u8 protocol, int flow_flags)
1013 {
1014 const struct iphdr *iph = (const struct iphdr *) skb->data;
1015 struct flowi4 fl4;
1016 struct rtable *rt;
1017
1018 if (!mark)
1019 mark = IP4_REPLY_MARK(net, skb->mark);
1020
1021 __build_flow_key(net, &fl4, NULL, iph, oif,
1022 RT_TOS(iph->tos), protocol, mark, flow_flags);
1023 rt = __ip_route_output_key(net, &fl4);
1024 if (!IS_ERR(rt)) {
1025 __ip_rt_update_pmtu(rt, &fl4, mtu);
1026 ip_rt_put(rt);
1027 }
1028 }
1029 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
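/* Illustrative caller (sketch, not part of this file; "t" and "info" are
 * names from a hypothetical tunnel driver): tunnel ICMP error handlers are
 * the typical users.  A GRE-style err handler reacting to a "fragmentation
 * needed" error does roughly:
 *
 *	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *				 t->parms.link, 0, IPPROTO_GRE, 0);
 *
 * where "info" carries the next-hop MTU from the ICMP header; the route is
 * then re-resolved here and the learned MTU stored as a nexthop exception by
 * __ip_rt_update_pmtu().
 */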
1030
1031 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1032 {
1033 const struct iphdr *iph = (const struct iphdr *) skb->data;
1034 struct flowi4 fl4;
1035 struct rtable *rt;
1036
1037 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1038
1039 if (!fl4.flowi4_mark)
1040 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1041
1042 rt = __ip_route_output_key(sock_net(sk), &fl4);
1043 if (!IS_ERR(rt)) {
1044 __ip_rt_update_pmtu(rt, &fl4, mtu);
1045 ip_rt_put(rt);
1046 }
1047 }
1048
1049 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1050 {
1051 const struct iphdr *iph = (const struct iphdr *) skb->data;
1052 struct flowi4 fl4;
1053 struct rtable *rt;
1054 struct dst_entry *odst = NULL;
1055 bool new = false;
1056 struct net *net = sock_net(sk);
1057
1058 bh_lock_sock(sk);
1059
1060 if (!ip_sk_accept_pmtu(sk))
1061 goto out;
1062
1063 odst = sk_dst_get(sk);
1064
1065 if (sock_owned_by_user(sk) || !odst) {
1066 __ipv4_sk_update_pmtu(skb, sk, mtu);
1067 goto out;
1068 }
1069
1070 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1071
1072 rt = (struct rtable *)odst;
1073 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1074 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1075 if (IS_ERR(rt))
1076 goto out;
1077
1078 new = true;
1079 }
1080
1081 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1082
1083 if (!dst_check(&rt->dst, 0)) {
1084 if (new)
1085 dst_release(&rt->dst);
1086
1087 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1088 if (IS_ERR(rt))
1089 goto out;
1090
1091 new = true;
1092 }
1093
1094 if (new)
1095 sk_dst_set(sk, &rt->dst);
1096
1097 out:
1098 bh_unlock_sock(sk);
1099 dst_release(odst);
1100 }
1101 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1102
1103 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1104 int oif, u32 mark, u8 protocol, int flow_flags)
1105 {
1106 const struct iphdr *iph = (const struct iphdr *) skb->data;
1107 struct flowi4 fl4;
1108 struct rtable *rt;
1109
1110 __build_flow_key(net, &fl4, NULL, iph, oif,
1111 RT_TOS(iph->tos), protocol, mark, flow_flags);
1112 rt = __ip_route_output_key(net, &fl4);
1113 if (!IS_ERR(rt)) {
1114 __ip_do_redirect(rt, skb, &fl4, false);
1115 ip_rt_put(rt);
1116 }
1117 }
1118 EXPORT_SYMBOL_GPL(ipv4_redirect);
1119
1120 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1121 {
1122 const struct iphdr *iph = (const struct iphdr *) skb->data;
1123 struct flowi4 fl4;
1124 struct rtable *rt;
1125 struct net *net = sock_net(sk);
1126
1127 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1128 rt = __ip_route_output_key(net, &fl4);
1129 if (!IS_ERR(rt)) {
1130 __ip_do_redirect(rt, skb, &fl4, false);
1131 ip_rt_put(rt);
1132 }
1133 }
1134 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1135
1136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1137 {
1138 struct rtable *rt = (struct rtable *) dst;
1139
1140 /* All IPV4 dsts are created with ->obsolete set to the value
1141 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1142 * into this function always.
1143 *
1144 * When a PMTU/redirect information update invalidates a route,
1145 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1146 * DST_OBSOLETE_DEAD by dst_free().
1147 */
1148 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1149 return NULL;
1150 return dst;
1151 }
1152
1153 static void ipv4_link_failure(struct sk_buff *skb)
1154 {
1155 struct rtable *rt;
1156
1157 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1158
1159 rt = skb_rtable(skb);
1160 if (rt)
1161 dst_set_expires(&rt->dst, 0);
1162 }
1163
1164 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1165 {
1166 pr_debug("%s: %pI4 -> %pI4, %s\n",
1167 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1168 skb->dev ? skb->dev->name : "?");
1169 kfree_skb(skb);
1170 WARN_ON(1);
1171 return 0;
1172 }
1173
1174 /*
1175 	We do not cache the source address of the outgoing interface,
1176 	because it is used only by the IP RR, TS and SRR options,
1177 	so it is out of the fast path.
1178 
1179 	BTW remember: "addr" is allowed to be unaligned
1180 	in IP options!
1181 */
1182
1183 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1184 {
1185 __be32 src;
1186
1187 if (rt_is_output_route(rt))
1188 src = ip_hdr(skb)->saddr;
1189 else {
1190 struct fib_result res;
1191 struct flowi4 fl4;
1192 struct iphdr *iph;
1193
1194 iph = ip_hdr(skb);
1195
1196 memset(&fl4, 0, sizeof(fl4));
1197 fl4.daddr = iph->daddr;
1198 fl4.saddr = iph->saddr;
1199 fl4.flowi4_tos = RT_TOS(iph->tos);
1200 fl4.flowi4_oif = rt->dst.dev->ifindex;
1201 fl4.flowi4_iif = skb->dev->ifindex;
1202 fl4.flowi4_mark = skb->mark;
1203
1204 rcu_read_lock();
1205 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1206 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1207 else
1208 src = inet_select_addr(rt->dst.dev,
1209 rt_nexthop(rt, iph->daddr),
1210 RT_SCOPE_UNIVERSE);
1211 rcu_read_unlock();
1212 }
1213 memcpy(addr, &src, 4);
1214 }
1215
1216 #ifdef CONFIG_IP_ROUTE_CLASSID
1217 static void set_class_tag(struct rtable *rt, u32 tag)
1218 {
1219 if (!(rt->dst.tclassid & 0xFFFF))
1220 rt->dst.tclassid |= tag & 0xFFFF;
1221 if (!(rt->dst.tclassid & 0xFFFF0000))
1222 rt->dst.tclassid |= tag & 0xFFFF0000;
1223 }
1224 #endif
1225
1226 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1227 {
1228 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1229
1230 if (advmss == 0) {
1231 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1232 ip_rt_min_advmss);
1233 if (advmss > 65535 - 40)
1234 advmss = 65535 - 40;
1235 }
1236 return advmss;
1237 }
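/* Example (illustration only): a plain Ethernet device with a 1500-byte MTU
 * and no RTAX_ADVMSS metric set advertises 1500 - 40 = 1460 bytes, the usual
 * IPv4 TCP MSS without options; the result is never below ip_rt_min_advmss
 * and never above 65535 - 40.
 */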
1238
1239 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1240 {
1241 const struct rtable *rt = (const struct rtable *) dst;
1242 unsigned int mtu = rt->rt_pmtu;
1243
1244 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1245 mtu = dst_metric_raw(dst, RTAX_MTU);
1246
1247 if (mtu)
1248 return mtu;
1249
1250 mtu = dst->dev->mtu;
1251
1252 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1253 if (rt->rt_uses_gateway && mtu > 576)
1254 mtu = 576;
1255 }
1256
1257 return min_t(unsigned int, mtu, IP_MAX_MTU);
1258 }
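/* Example (illustration only): a route with a still-valid learned PMTU
 * (rt_pmtu set and not expired) reports that value; otherwise the RTAX_MTU
 * metric or, failing that, the device MTU is used, clamped to 576 when the
 * MTU metric is locked and the route goes via a gateway, and never above
 * IP_MAX_MTU.
 */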
1259
1260 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1261 {
1262 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1263 struct fib_nh_exception *fnhe;
1264 u32 hval;
1265
1266 if (!hash)
1267 return NULL;
1268
1269 hval = fnhe_hashfun(daddr);
1270
1271 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1272 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1273 if (fnhe->fnhe_daddr == daddr)
1274 return fnhe;
1275 }
1276 return NULL;
1277 }
1278
1279 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1280 __be32 daddr)
1281 {
1282 bool ret = false;
1283
1284 spin_lock_bh(&fnhe_lock);
1285
1286 if (daddr == fnhe->fnhe_daddr) {
1287 struct rtable __rcu **porig;
1288 struct rtable *orig;
1289 int genid = fnhe_genid(dev_net(rt->dst.dev));
1290
1291 if (rt_is_input_route(rt))
1292 porig = &fnhe->fnhe_rth_input;
1293 else
1294 porig = &fnhe->fnhe_rth_output;
1295 orig = rcu_dereference(*porig);
1296
1297 if (fnhe->fnhe_genid != genid) {
1298 fnhe->fnhe_genid = genid;
1299 fnhe->fnhe_gw = 0;
1300 fnhe->fnhe_pmtu = 0;
1301 fnhe->fnhe_expires = 0;
1302 fnhe_flush_routes(fnhe);
1303 orig = NULL;
1304 }
1305 fill_route_from_fnhe(rt, fnhe);
1306 if (!rt->rt_gateway)
1307 rt->rt_gateway = daddr;
1308
1309 if (!(rt->dst.flags & DST_NOCACHE)) {
1310 rcu_assign_pointer(*porig, rt);
1311 if (orig)
1312 rt_free(orig);
1313 ret = true;
1314 }
1315
1316 fnhe->fnhe_stamp = jiffies;
1317 }
1318 spin_unlock_bh(&fnhe_lock);
1319
1320 return ret;
1321 }
1322
1323 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1324 {
1325 struct rtable *orig, *prev, **p;
1326 bool ret = true;
1327
1328 if (rt_is_input_route(rt)) {
1329 p = (struct rtable **)&nh->nh_rth_input;
1330 } else {
1331 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1332 }
1333 orig = *p;
1334
1335 prev = cmpxchg(p, orig, rt);
1336 if (prev == orig) {
1337 if (orig)
1338 rt_free(orig);
1339 } else
1340 ret = false;
1341
1342 return ret;
1343 }
1344
1345 struct uncached_list {
1346 spinlock_t lock;
1347 struct list_head head;
1348 };
1349
1350 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1351
1352 static void rt_add_uncached_list(struct rtable *rt)
1353 {
1354 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1355
1356 rt->rt_uncached_list = ul;
1357
1358 spin_lock_bh(&ul->lock);
1359 list_add_tail(&rt->rt_uncached, &ul->head);
1360 spin_unlock_bh(&ul->lock);
1361 }
1362
1363 static void ipv4_dst_destroy(struct dst_entry *dst)
1364 {
1365 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1366 struct rtable *rt = (struct rtable *) dst;
1367
1368 if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1369 kfree(p);
1370
1371 if (!list_empty(&rt->rt_uncached)) {
1372 struct uncached_list *ul = rt->rt_uncached_list;
1373
1374 spin_lock_bh(&ul->lock);
1375 list_del(&rt->rt_uncached);
1376 spin_unlock_bh(&ul->lock);
1377 }
1378 }
1379
1380 void rt_flush_dev(struct net_device *dev)
1381 {
1382 struct net *net = dev_net(dev);
1383 struct rtable *rt;
1384 int cpu;
1385
1386 for_each_possible_cpu(cpu) {
1387 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1388
1389 spin_lock_bh(&ul->lock);
1390 list_for_each_entry(rt, &ul->head, rt_uncached) {
1391 if (rt->dst.dev != dev)
1392 continue;
1393 rt->dst.dev = net->loopback_dev;
1394 dev_hold(rt->dst.dev);
1395 dev_put(dev);
1396 }
1397 spin_unlock_bh(&ul->lock);
1398 }
1399 }
1400
1401 static bool rt_cache_valid(const struct rtable *rt)
1402 {
1403 return rt &&
1404 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1405 !rt_is_expired(rt);
1406 }
1407
1408 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1409 const struct fib_result *res,
1410 struct fib_nh_exception *fnhe,
1411 struct fib_info *fi, u16 type, u32 itag)
1412 {
1413 bool cached = false;
1414
1415 if (fi) {
1416 struct fib_nh *nh = &FIB_RES_NH(*res);
1417
1418 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1419 rt->rt_gateway = nh->nh_gw;
1420 rt->rt_uses_gateway = 1;
1421 }
1422 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1423 if (fi->fib_metrics != &dst_default_metrics) {
1424 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1425 atomic_inc(&fi->fib_metrics->refcnt);
1426 }
1427 #ifdef CONFIG_IP_ROUTE_CLASSID
1428 rt->dst.tclassid = nh->nh_tclassid;
1429 #endif
1430 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1431 if (unlikely(fnhe))
1432 cached = rt_bind_exception(rt, fnhe, daddr);
1433 else if (!(rt->dst.flags & DST_NOCACHE))
1434 cached = rt_cache_route(nh, rt);
1435 if (unlikely(!cached)) {
1436 /* Routes we intend to cache in nexthop exception or
1437 * FIB nexthop have the DST_NOCACHE bit clear.
1438 * However, if we are unsuccessful at storing this
1439 * route into the cache we really need to set it.
1440 */
1441 rt->dst.flags |= DST_NOCACHE;
1442 if (!rt->rt_gateway)
1443 rt->rt_gateway = daddr;
1444 rt_add_uncached_list(rt);
1445 }
1446 } else
1447 rt_add_uncached_list(rt);
1448
1449 #ifdef CONFIG_IP_ROUTE_CLASSID
1450 #ifdef CONFIG_IP_MULTIPLE_TABLES
1451 set_class_tag(rt, res->tclassid);
1452 #endif
1453 set_class_tag(rt, itag);
1454 #endif
1455 }
1456
1457 static struct rtable *rt_dst_alloc(struct net_device *dev,
1458 unsigned int flags, u16 type,
1459 bool nopolicy, bool noxfrm, bool will_cache)
1460 {
1461 struct rtable *rt;
1462
1463 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1464 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1465 (nopolicy ? DST_NOPOLICY : 0) |
1466 (noxfrm ? DST_NOXFRM : 0));
1467
1468 if (rt) {
1469 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1470 rt->rt_flags = flags;
1471 rt->rt_type = type;
1472 rt->rt_is_input = 0;
1473 rt->rt_iif = 0;
1474 rt->rt_pmtu = 0;
1475 rt->rt_gateway = 0;
1476 rt->rt_uses_gateway = 0;
1477 rt->rt_table_id = 0;
1478 INIT_LIST_HEAD(&rt->rt_uncached);
1479
1480 rt->dst.output = ip_output;
1481 if (flags & RTCF_LOCAL)
1482 rt->dst.input = ip_local_deliver;
1483 }
1484
1485 return rt;
1486 }
1487
1488 /* called in rcu_read_lock() section */
1489 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1490 u8 tos, struct net_device *dev, int our)
1491 {
1492 struct rtable *rth;
1493 struct in_device *in_dev = __in_dev_get_rcu(dev);
1494 unsigned int flags = RTCF_MULTICAST;
1495 u32 itag = 0;
1496 int err;
1497
1498 /* Primary sanity checks. */
1499
1500 if (!in_dev)
1501 return -EINVAL;
1502
1503 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1504 skb->protocol != htons(ETH_P_IP))
1505 goto e_inval;
1506
1507 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1508 goto e_inval;
1509
1510 if (ipv4_is_zeronet(saddr)) {
1511 if (!ipv4_is_local_multicast(daddr))
1512 goto e_inval;
1513 } else {
1514 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1515 in_dev, &itag);
1516 if (err < 0)
1517 goto e_err;
1518 }
1519 if (our)
1520 flags |= RTCF_LOCAL;
1521
1522 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1523 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1524 if (!rth)
1525 goto e_nobufs;
1526
1527 #ifdef CONFIG_IP_ROUTE_CLASSID
1528 rth->dst.tclassid = itag;
1529 #endif
1530 rth->dst.output = ip_rt_bug;
1531 rth->rt_is_input= 1;
1532
1533 #ifdef CONFIG_IP_MROUTE
1534 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1535 rth->dst.input = ip_mr_input;
1536 #endif
1537 RT_CACHE_STAT_INC(in_slow_mc);
1538
1539 skb_dst_set(skb, &rth->dst);
1540 return 0;
1541
1542 e_nobufs:
1543 return -ENOBUFS;
1544 e_inval:
1545 return -EINVAL;
1546 e_err:
1547 return err;
1548 }
1549
1550
1551 static void ip_handle_martian_source(struct net_device *dev,
1552 struct in_device *in_dev,
1553 struct sk_buff *skb,
1554 __be32 daddr,
1555 __be32 saddr)
1556 {
1557 RT_CACHE_STAT_INC(in_martian_src);
1558 #ifdef CONFIG_IP_ROUTE_VERBOSE
1559 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1560 /*
1561 		 *	RFC1812 recommendation: if the source is martian,
1562 		 *	the only hint is the MAC header.
1563 */
1564 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1565 &daddr, &saddr, dev->name);
1566 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1567 print_hex_dump(KERN_WARNING, "ll header: ",
1568 DUMP_PREFIX_OFFSET, 16, 1,
1569 skb_mac_header(skb),
1570 dev->hard_header_len, true);
1571 }
1572 }
1573 #endif
1574 }
1575
1576 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1577 {
1578 struct fnhe_hash_bucket *hash;
1579 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1580 u32 hval = fnhe_hashfun(daddr);
1581
1582 spin_lock_bh(&fnhe_lock);
1583
1584 hash = rcu_dereference_protected(nh->nh_exceptions,
1585 lockdep_is_held(&fnhe_lock));
1586 hash += hval;
1587
1588 fnhe_p = &hash->chain;
1589 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1590 while (fnhe) {
1591 if (fnhe->fnhe_daddr == daddr) {
1592 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1593 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1594 fnhe_flush_routes(fnhe);
1595 kfree_rcu(fnhe, rcu);
1596 break;
1597 }
1598 fnhe_p = &fnhe->fnhe_next;
1599 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1600 lockdep_is_held(&fnhe_lock));
1601 }
1602
1603 spin_unlock_bh(&fnhe_lock);
1604 }
1605
1606 /* called in rcu_read_lock() section */
1607 static int __mkroute_input(struct sk_buff *skb,
1608 const struct fib_result *res,
1609 struct in_device *in_dev,
1610 __be32 daddr, __be32 saddr, u32 tos)
1611 {
1612 struct fib_nh_exception *fnhe;
1613 struct rtable *rth;
1614 int err;
1615 struct in_device *out_dev;
1616 bool do_cache;
1617 u32 itag = 0;
1618
1619 /* get a working reference to the output device */
1620 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1621 if (!out_dev) {
1622 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1623 return -EINVAL;
1624 }
1625
1626 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1627 in_dev->dev, in_dev, &itag);
1628 if (err < 0) {
1629 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1630 saddr);
1631
1632 goto cleanup;
1633 }
1634
1635 do_cache = res->fi && !itag;
1636 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1637 skb->protocol == htons(ETH_P_IP) &&
1638 (IN_DEV_SHARED_MEDIA(out_dev) ||
1639 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1640 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1641
1642 if (skb->protocol != htons(ETH_P_IP)) {
1643 		/* Not IP (i.e. ARP). Do not create a route if it is
1644 		 * invalid for proxy ARP. DNAT routes are always valid.
1645 		 *
1646 		 * The proxy ARP feature has been extended to allow ARP
1647 		 * replies back out of the same interface, to support
1648 		 * Private VLAN switch technologies. See arp.c.
1649 */
1650 if (out_dev == in_dev &&
1651 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1652 err = -EINVAL;
1653 goto cleanup;
1654 }
1655 }
1656
1657 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1658 if (do_cache) {
1659 if (fnhe) {
1660 rth = rcu_dereference(fnhe->fnhe_rth_input);
1661 if (rth && rth->dst.expires &&
1662 time_after(jiffies, rth->dst.expires)) {
1663 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1664 fnhe = NULL;
1665 } else {
1666 goto rt_cache;
1667 }
1668 }
1669
1670 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1671
1672 rt_cache:
1673 if (rt_cache_valid(rth)) {
1674 skb_dst_set_noref(skb, &rth->dst);
1675 goto out;
1676 }
1677 }
1678
1679 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1680 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1681 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1682 if (!rth) {
1683 err = -ENOBUFS;
1684 goto cleanup;
1685 }
1686
1687 rth->rt_is_input = 1;
1688 if (res->table)
1689 rth->rt_table_id = res->table->tb_id;
1690 RT_CACHE_STAT_INC(in_slow_tot);
1691
1692 rth->dst.input = ip_forward;
1693
1694 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1695 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1696 rth->dst.lwtstate->orig_output = rth->dst.output;
1697 rth->dst.output = lwtunnel_output;
1698 }
1699 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1700 rth->dst.lwtstate->orig_input = rth->dst.input;
1701 rth->dst.input = lwtunnel_input;
1702 }
1703 skb_dst_set(skb, &rth->dst);
1704 out:
1705 err = 0;
1706 cleanup:
1707 return err;
1708 }
1709
1710 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1711
1712 /* To make ICMP packets follow the right flow, the multipath hash is
1713 * calculated from the inner IP addresses in reverse order.
1714 */
1715 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1716 {
1717 const struct iphdr *outer_iph = ip_hdr(skb);
1718 struct icmphdr _icmph;
1719 const struct icmphdr *icmph;
1720 struct iphdr _inner_iph;
1721 const struct iphdr *inner_iph;
1722
1723 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1724 goto standard_hash;
1725
1726 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1727 &_icmph);
1728 if (!icmph)
1729 goto standard_hash;
1730
1731 if (icmph->type != ICMP_DEST_UNREACH &&
1732 icmph->type != ICMP_REDIRECT &&
1733 icmph->type != ICMP_TIME_EXCEEDED &&
1734 icmph->type != ICMP_PARAMETERPROB) {
1735 goto standard_hash;
1736 }
1737
1738 inner_iph = skb_header_pointer(skb,
1739 outer_iph->ihl * 4 + sizeof(_icmph),
1740 sizeof(_inner_iph), &_inner_iph);
1741 if (!inner_iph)
1742 goto standard_hash;
1743
1744 return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1745
1746 standard_hash:
1747 return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1748 }
1749
1750 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1751
1752 static int ip_mkroute_input(struct sk_buff *skb,
1753 struct fib_result *res,
1754 const struct flowi4 *fl4,
1755 struct in_device *in_dev,
1756 __be32 daddr, __be32 saddr, u32 tos)
1757 {
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1759 if (res->fi && res->fi->fib_nhs > 1) {
1760 int h;
1761
1762 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1763 h = ip_multipath_icmp_hash(skb);
1764 else
1765 h = fib_multipath_hash(saddr, daddr);
1766 fib_select_multipath(res, h);
1767 }
1768 #endif
1769
1770 /* create a routing cache entry */
1771 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1772 }
1773
1774 /*
1775  *	NOTE. We drop all packets that have local source
1776  *	addresses, because every properly looped-back packet
1777  *	must already have the correct destination attached by the output routine.
1778  *
1779  *	This approach solves two big problems:
1780  *	1. Non-simplex devices are handled properly.
1781  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1782 * called with rcu_read_lock()
1783 */
1784
1785 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1786 u8 tos, struct net_device *dev)
1787 {
1788 struct fib_result res;
1789 struct in_device *in_dev = __in_dev_get_rcu(dev);
1790 struct ip_tunnel_info *tun_info;
1791 struct flowi4 fl4;
1792 unsigned int flags = 0;
1793 u32 itag = 0;
1794 struct rtable *rth;
1795 int err = -EINVAL;
1796 struct net *net = dev_net(dev);
1797 bool do_cache;
1798
1799 /* IP on this device is disabled. */
1800
1801 if (!in_dev)
1802 goto out;
1803
1804 	/* Check for the weirdest martians, which cannot be detected
1805 by fib_lookup.
1806 */
1807
1808 tun_info = skb_tunnel_info(skb);
1809 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1810 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1811 else
1812 fl4.flowi4_tun_key.tun_id = 0;
1813 skb_dst_drop(skb);
1814
1815 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1816 goto martian_source;
1817
1818 res.fi = NULL;
1819 res.table = NULL;
1820 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1821 goto brd_input;
1822
1823 /* Accept zero addresses only to limited broadcast;
1824 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1825 */
1826 if (ipv4_is_zeronet(saddr))
1827 goto martian_source;
1828
1829 if (ipv4_is_zeronet(daddr))
1830 goto martian_destination;
1831
1832 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1833 	 * and calls it at most once when daddr and/or saddr are loopback addresses
1834 */
1835 if (ipv4_is_loopback(daddr)) {
1836 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1837 goto martian_destination;
1838 } else if (ipv4_is_loopback(saddr)) {
1839 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1840 goto martian_source;
1841 }
1842
1843 /*
1844 * Now we are ready to route packet.
1845 */
1846 fl4.flowi4_oif = 0;
1847 fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1848 fl4.flowi4_mark = skb->mark;
1849 fl4.flowi4_tos = tos;
1850 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1851 fl4.flowi4_flags = 0;
1852 fl4.daddr = daddr;
1853 fl4.saddr = saddr;
1854 err = fib_lookup(net, &fl4, &res, 0);
1855 if (err != 0) {
1856 if (!IN_DEV_FORWARD(in_dev))
1857 err = -EHOSTUNREACH;
1858 goto no_route;
1859 }
1860
1861 if (res.type == RTN_BROADCAST)
1862 goto brd_input;
1863
1864 if (res.type == RTN_LOCAL) {
1865 err = fib_validate_source(skb, saddr, daddr, tos,
1866 0, dev, in_dev, &itag);
1867 if (err < 0)
1868 goto martian_source;
1869 goto local_input;
1870 }
1871
1872 if (!IN_DEV_FORWARD(in_dev)) {
1873 err = -EHOSTUNREACH;
1874 goto no_route;
1875 }
1876 if (res.type != RTN_UNICAST)
1877 goto martian_destination;
1878
1879 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1880 out: return err;
1881
1882 brd_input:
1883 if (skb->protocol != htons(ETH_P_IP))
1884 goto e_inval;
1885
1886 if (!ipv4_is_zeronet(saddr)) {
1887 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1888 in_dev, &itag);
1889 if (err < 0)
1890 goto martian_source;
1891 }
1892 flags |= RTCF_BROADCAST;
1893 res.type = RTN_BROADCAST;
1894 RT_CACHE_STAT_INC(in_brd);
1895
1896 local_input:
1897 do_cache = false;
1898 if (res.fi) {
1899 if (!itag) {
1900 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1901 if (rt_cache_valid(rth)) {
1902 skb_dst_set_noref(skb, &rth->dst);
1903 err = 0;
1904 goto out;
1905 }
1906 do_cache = true;
1907 }
1908 }
1909
1910 rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1911 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1912 if (!rth)
1913 goto e_nobufs;
1914
1915 rth->dst.output= ip_rt_bug;
1916 #ifdef CONFIG_IP_ROUTE_CLASSID
1917 rth->dst.tclassid = itag;
1918 #endif
1919 rth->rt_is_input = 1;
1920 if (res.table)
1921 rth->rt_table_id = res.table->tb_id;
1922
1923 RT_CACHE_STAT_INC(in_slow_tot);
1924 if (res.type == RTN_UNREACHABLE) {
1925 rth->dst.input= ip_error;
1926 rth->dst.error= -err;
1927 rth->rt_flags &= ~RTCF_LOCAL;
1928 }
1929 if (do_cache) {
1930 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1931 rth->dst.flags |= DST_NOCACHE;
1932 rt_add_uncached_list(rth);
1933 }
1934 }
1935 skb_dst_set(skb, &rth->dst);
1936 err = 0;
1937 goto out;
1938
1939 no_route:
1940 RT_CACHE_STAT_INC(in_no_route);
1941 res.type = RTN_UNREACHABLE;
1942 res.fi = NULL;
1943 res.table = NULL;
1944 goto local_input;
1945
1946 /*
1947 * Do not cache martian addresses: they should be logged (RFC1812)
1948 */
1949 martian_destination:
1950 RT_CACHE_STAT_INC(in_martian_dst);
1951 #ifdef CONFIG_IP_ROUTE_VERBOSE
1952 if (IN_DEV_LOG_MARTIANS(in_dev))
1953 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1954 &daddr, &saddr, dev->name);
1955 #endif
1956
1957 e_inval:
1958 err = -EINVAL;
1959 goto out;
1960
1961 e_nobufs:
1962 err = -ENOBUFS;
1963 goto out;
1964
1965 martian_source:
1966 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1967 goto out;
1968 }
1969
1970 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1971 u8 tos, struct net_device *dev)
1972 {
1973 int res;
1974
1975 tos &= IPTOS_RT_MASK;
1976 rcu_read_lock();
1977
1978 	/* Multicast recognition logic is moved from the route cache to here.
1979 	   The problem was that too many Ethernet cards have broken/missing
1980 	   hardware multicast filters :-( As a result, a host on a multicast
1981 	   network acquires a lot of useless route cache entries, e.g. for
1982 	   SDR messages from all over the world. Now we try to get rid of them.
1983 	   Really, provided the software IP multicast filter is organized
1984 	   reasonably (at least, hashed), it does not result in a slowdown
1985 	   compared with route cache reject entries.
1986 	   Note that multicast routers are not affected, because a
1987 	   route cache entry is eventually created for them.
1988 */
1989 if (ipv4_is_multicast(daddr)) {
1990 struct in_device *in_dev = __in_dev_get_rcu(dev);
1991
1992 if (in_dev) {
1993 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1994 ip_hdr(skb)->protocol);
1995 if (our
1996 #ifdef CONFIG_IP_MROUTE
1997 ||
1998 (!ipv4_is_local_multicast(daddr) &&
1999 IN_DEV_MFORWARD(in_dev))
2000 #endif
2001 ) {
2002 int res = ip_route_input_mc(skb, daddr, saddr,
2003 tos, dev, our);
2004 rcu_read_unlock();
2005 return res;
2006 }
2007 }
2008 rcu_read_unlock();
2009 return -EINVAL;
2010 }
2011 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2012 rcu_read_unlock();
2013 return res;
2014 }
2015 EXPORT_SYMBOL(ip_route_input_noref);
2016
2017 /* called with rcu_read_lock() */
2018 static struct rtable *__mkroute_output(const struct fib_result *res,
2019 const struct flowi4 *fl4, int orig_oif,
2020 struct net_device *dev_out,
2021 unsigned int flags)
2022 {
2023 struct fib_info *fi = res->fi;
2024 struct fib_nh_exception *fnhe;
2025 struct in_device *in_dev;
2026 u16 type = res->type;
2027 struct rtable *rth;
2028 bool do_cache;
2029
2030 in_dev = __in_dev_get_rcu(dev_out);
2031 if (!in_dev)
2032 return ERR_PTR(-EINVAL);
2033
2034 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2035 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2036 return ERR_PTR(-EINVAL);
2037
2038 if (ipv4_is_lbcast(fl4->daddr))
2039 type = RTN_BROADCAST;
2040 else if (ipv4_is_multicast(fl4->daddr))
2041 type = RTN_MULTICAST;
2042 else if (ipv4_is_zeronet(fl4->daddr))
2043 return ERR_PTR(-EINVAL);
2044
2045 if (dev_out->flags & IFF_LOOPBACK)
2046 flags |= RTCF_LOCAL;
2047
2048 do_cache = true;
2049 if (type == RTN_BROADCAST) {
2050 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2051 fi = NULL;
2052 } else if (type == RTN_MULTICAST) {
2053 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2054 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2055 fl4->flowi4_proto))
2056 flags &= ~RTCF_LOCAL;
2057 else
2058 do_cache = false;
2059 		/* If the multicast route does not exist, use the
2060 		 * default one, but do not use a gateway in this case.
2061 		 * Yes, it is a hack.
2062 		 */
2063 if (fi && res->prefixlen < 4)
2064 fi = NULL;
2065 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2066 (orig_oif != dev_out->ifindex)) {
2067 		/* For local routes that require a particular output interface
2068 		 * we do not want to cache the result.  Caching the result
2069 		 * causes incorrect behaviour when there are multiple source
2070 		 * addresses on the interface: if the intended recipient is
2071 		 * waiting on that interface for the packet, it won't receive
2072 		 * it, because the packet will be delivered on the loopback
2073 		 * interface and the IP_PKTINFO ipi_ifindex will be set to
2074 		 * the loopback interface as well.
2075 		 */
2076 fi = NULL;
2077 }
2078
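	/* Output routes are cached per nexthop: a destination with exception
	 * state (PMTU/redirect) uses the fnhe_rth_output slot of its
	 * fib_nh_exception, everything else uses the per-CPU
	 * nh_pcpu_rth_output slot; FLOWI_FLAG_KNOWN_NH flows without an
	 * on-link gateway are not cached at all.
	 */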
2079 fnhe = NULL;
2080 do_cache &= fi != NULL;
2081 if (do_cache) {
2082 struct rtable __rcu **prth;
2083 struct fib_nh *nh = &FIB_RES_NH(*res);
2084
2085 fnhe = find_exception(nh, fl4->daddr);
2086 if (fnhe) {
2087 prth = &fnhe->fnhe_rth_output;
2088 rth = rcu_dereference(*prth);
2089 if (rth && rth->dst.expires &&
2090 time_after(jiffies, rth->dst.expires)) {
2091 ip_del_fnhe(nh, fl4->daddr);
2092 fnhe = NULL;
2093 } else {
2094 goto rt_cache;
2095 }
2096 }
2097
2098 if (unlikely(fl4->flowi4_flags &
2099 FLOWI_FLAG_KNOWN_NH &&
2100 !(nh->nh_gw &&
2101 nh->nh_scope == RT_SCOPE_LINK))) {
2102 do_cache = false;
2103 goto add;
2104 }
2105 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2106 rth = rcu_dereference(*prth);
2107
2108 rt_cache:
2109 if (rt_cache_valid(rth)) {
2110 dst_hold(&rth->dst);
2111 return rth;
2112 }
2113 }
2114
2115 add:
2116 rth = rt_dst_alloc(dev_out, flags, type,
2117 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2118 IN_DEV_CONF_GET(in_dev, NOXFRM),
2119 do_cache);
2120 if (!rth)
2121 return ERR_PTR(-ENOBUFS);
2122
2123 rth->rt_iif = orig_oif ? : 0;
2124 if (res->table)
2125 rth->rt_table_id = res->table->tb_id;
2126
2127 RT_CACHE_STAT_INC(out_slow_tot);
2128
2129 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2130 if (flags & RTCF_LOCAL &&
2131 !(dev_out->flags & IFF_LOOPBACK)) {
2132 rth->dst.output = ip_mc_output;
2133 RT_CACHE_STAT_INC(out_slow_mc);
2134 }
2135 #ifdef CONFIG_IP_MROUTE
2136 if (type == RTN_MULTICAST) {
2137 if (IN_DEV_MFORWARD(in_dev) &&
2138 !ipv4_is_local_multicast(fl4->daddr)) {
2139 rth->dst.input = ip_mr_input;
2140 rth->dst.output = ip_mc_output;
2141 }
2142 }
2143 #endif
2144 }
2145
2146 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2147 if (lwtunnel_output_redirect(rth->dst.lwtstate))
2148 rth->dst.output = lwtunnel_output;
2149
2150 return rth;
2151 }
2152
2153 /*
2154 * Major route resolver routine.
2155 */
2156
2157 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2158 int mp_hash)
2159 {
2160 struct net_device *dev_out = NULL;
2161 __u8 tos = RT_FL_TOS(fl4);
2162 unsigned int flags = 0;
2163 struct fib_result res;
2164 struct rtable *rth;
2165 int orig_oif;
2166 int err = -ENETUNREACH;
2167
2168 res.tclassid = 0;
2169 res.fi = NULL;
2170 res.table = NULL;
2171
2172 orig_oif = fl4->flowi4_oif;
2173
2174 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2175 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2176 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2177 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2178
2179 rcu_read_lock();
2180 if (fl4->saddr) {
2181 rth = ERR_PTR(-EINVAL);
2182 if (ipv4_is_multicast(fl4->saddr) ||
2183 ipv4_is_lbcast(fl4->saddr) ||
2184 ipv4_is_zeronet(fl4->saddr))
2185 goto out;
2186
2187 		/* I removed the check for oif == dev_out->oif here.
2188 		   It was wrong for two reasons:
2189 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2190 		      is assigned to multiple interfaces.
2191 		   2. Moreover, we are allowed to send packets with the saddr
2192 		      of another iface. --ANK
2193 		 */
2194
2195 if (fl4->flowi4_oif == 0 &&
2196 (ipv4_is_multicast(fl4->daddr) ||
2197 ipv4_is_lbcast(fl4->daddr))) {
2198 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2199 dev_out = __ip_dev_find(net, fl4->saddr, false);
2200 if (!dev_out)
2201 goto out;
2202
2203 			/* Special hack: the user can direct multicasts
2204 			   and limited broadcasts via the necessary interface
2205 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2206 			   This hack is not just for fun, it allows
2207 			   vic, vat and friends to work.
2208 			   They bind the socket to loopback, set the ttl to zero
2209 			   and expect that it will work.
2210 			   From the viewpoint of the routing cache they are broken,
2211 			   because we are not allowed to build a multicast path
2212 			   with a loopback source addr (look, the routing cache
2213 			   cannot know that the ttl is zero, so the packet
2214 			   will not leave this host and the route is valid).
2215 			   Luckily, this hack is a good workaround.
2216 			 */
2217
2218 fl4->flowi4_oif = dev_out->ifindex;
2219 goto make_route;
2220 }
2221
2222 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2223 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2224 if (!__ip_dev_find(net, fl4->saddr, false))
2225 goto out;
2226 }
2227 }
2228
2229
2230 if (fl4->flowi4_oif) {
2231 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2232 rth = ERR_PTR(-ENODEV);
2233 if (!dev_out)
2234 goto out;
2235
2236 /* RACE: Check return value of inet_select_addr instead. */
2237 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2238 rth = ERR_PTR(-ENETUNREACH);
2239 goto out;
2240 }
2241 if (ipv4_is_local_multicast(fl4->daddr) ||
2242 ipv4_is_lbcast(fl4->daddr) ||
2243 fl4->flowi4_proto == IPPROTO_IGMP) {
2244 if (!fl4->saddr)
2245 fl4->saddr = inet_select_addr(dev_out, 0,
2246 RT_SCOPE_LINK);
2247 goto make_route;
2248 }
2249 if (!fl4->saddr) {
2250 if (ipv4_is_multicast(fl4->daddr))
2251 fl4->saddr = inet_select_addr(dev_out, 0,
2252 fl4->flowi4_scope);
2253 else if (!fl4->daddr)
2254 fl4->saddr = inet_select_addr(dev_out, 0,
2255 RT_SCOPE_HOST);
2256 }
2257
2258 rth = l3mdev_get_rtable(dev_out, fl4);
2259 if (rth)
2260 goto out;
2261 }
2262
2263 if (!fl4->daddr) {
2264 fl4->daddr = fl4->saddr;
2265 if (!fl4->daddr)
2266 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2267 dev_out = net->loopback_dev;
2268 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2269 res.type = RTN_LOCAL;
2270 flags |= RTCF_LOCAL;
2271 goto make_route;
2272 }
2273
2274 err = fib_lookup(net, fl4, &res, 0);
2275 if (err) {
2276 res.fi = NULL;
2277 res.table = NULL;
2278 if (fl4->flowi4_oif &&
2279 !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2280 			/* Apparently, the routing tables are wrong. Assume
2281 			   that the destination is on-link.
2282 
2283 			   WHY? DW.
2284 			   Because we are allowed to send to an iface
2285 			   even if it has NO routes and NO assigned
2286 			   addresses. When oif is specified, the routing
2287 			   tables are looked up for only one purpose:
2288 			   to check whether the destination is gatewayed rather
2289 			   than direct. Moreover, if MSG_DONTROUTE is set,
2290 			   we send the packet, ignoring both routing tables
2291 			   and ifaddr state. --ANK
2292 
2293 
2294 			   We could do this even if oif is unknown,
2295 			   likely as IPv6 does, but we do not.
2296 			 */
2297
2298 if (fl4->saddr == 0)
2299 fl4->saddr = inet_select_addr(dev_out, 0,
2300 RT_SCOPE_LINK);
2301 res.type = RTN_UNICAST;
2302 goto make_route;
2303 }
2304 rth = ERR_PTR(err);
2305 goto out;
2306 }
2307
2308 if (res.type == RTN_LOCAL) {
2309 if (!fl4->saddr) {
2310 if (res.fi->fib_prefsrc)
2311 fl4->saddr = res.fi->fib_prefsrc;
2312 else
2313 fl4->saddr = fl4->daddr;
2314 }
2315 dev_out = net->loopback_dev;
2316 fl4->flowi4_oif = dev_out->ifindex;
2317 flags |= RTCF_LOCAL;
2318 goto make_route;
2319 }
2320
2321 fib_select_path(net, &res, fl4, mp_hash);
2322
2323 dev_out = FIB_RES_DEV(res);
2324 fl4->flowi4_oif = dev_out->ifindex;
2325
2326
2327 make_route:
2328 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2329
2330 out:
2331 rcu_read_unlock();
2332 return rth;
2333 }
2334 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
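
/* Callers normally reach this resolver through thin wrappers: assumed to be
 * __ip_route_output_key()/ip_route_output_key() in include/net/route.h
 * (which pass a default mp_hash), and ip_route_output_flow() below.
 */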
2335
2336 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2337 {
2338 return NULL;
2339 }
2340
2341 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2342 {
2343 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2344
2345 return mtu ? : dst->dev->mtu;
2346 }
2347
2348 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2349 struct sk_buff *skb, u32 mtu)
2350 {
2351 }
2352
2353 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2354 struct sk_buff *skb)
2355 {
2356 }
2357
2358 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2359 unsigned long old)
2360 {
2361 return NULL;
2362 }
2363
2364 static struct dst_ops ipv4_dst_blackhole_ops = {
2365 .family = AF_INET,
2366 .check = ipv4_blackhole_dst_check,
2367 .mtu = ipv4_blackhole_mtu,
2368 .default_advmss = ipv4_default_advmss,
2369 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2370 .redirect = ipv4_rt_blackhole_redirect,
2371 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2372 .neigh_lookup = ipv4_neigh_lookup,
2373 };
2374
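
/* Build a "blackhole" copy of dst_orig: the returned dst keeps the device
 * and routing metadata of the original route but discards every packet via
 * dst_discard/dst_discard_out, and the reference on dst_orig is released.
 */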
2375 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2376 {
2377 struct rtable *ort = (struct rtable *) dst_orig;
2378 struct rtable *rt;
2379
2380 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2381 if (rt) {
2382 struct dst_entry *new = &rt->dst;
2383
2384 new->__use = 1;
2385 new->input = dst_discard;
2386 new->output = dst_discard_out;
2387
2388 new->dev = ort->dst.dev;
2389 if (new->dev)
2390 dev_hold(new->dev);
2391
2392 rt->rt_is_input = ort->rt_is_input;
2393 rt->rt_iif = ort->rt_iif;
2394 rt->rt_pmtu = ort->rt_pmtu;
2395
2396 rt->rt_genid = rt_genid_ipv4(net);
2397 rt->rt_flags = ort->rt_flags;
2398 rt->rt_type = ort->rt_type;
2399 rt->rt_gateway = ort->rt_gateway;
2400 rt->rt_uses_gateway = ort->rt_uses_gateway;
2401
2402 INIT_LIST_HEAD(&rt->rt_uncached);
2403 dst_free(new);
2404 }
2405
2406 dst_release(dst_orig);
2407
2408 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2409 }
2410
2411 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2412 const struct sock *sk)
2413 {
2414 struct rtable *rt = __ip_route_output_key(net, flp4);
2415
2416 if (IS_ERR(rt))
2417 return rt;
2418
2419 if (flp4->flowi4_proto)
2420 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2421 flowi4_to_flowi(flp4),
2422 sk, 0);
2423
2424 return rt;
2425 }
2426 EXPORT_SYMBOL_GPL(ip_route_output_flow);
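
/* Caller-side sketch (illustrative only, not part of this file): a sender
 * typically fills a flow key and resolves it here, e.g.
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_oif	= oif,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On success, fl4.saddr has been filled in by the resolver if it was 0.
 */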
2427
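/* Serialize rt (plus the flow key it was resolved from) into an rtnetlink
 * message; used below by inet_rtm_getroute() to answer RTM_GETROUTE
 * requests with an RTM_NEWROUTE reply.
 */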
2428 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2429 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2430 u32 seq, int event, int nowait, unsigned int flags)
2431 {
2432 struct rtable *rt = skb_rtable(skb);
2433 struct rtmsg *r;
2434 struct nlmsghdr *nlh;
2435 unsigned long expires = 0;
2436 u32 error;
2437 u32 metrics[RTAX_MAX];
2438
2439 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2440 if (!nlh)
2441 return -EMSGSIZE;
2442
2443 r = nlmsg_data(nlh);
2444 r->rtm_family = AF_INET;
2445 r->rtm_dst_len = 32;
2446 r->rtm_src_len = 0;
2447 r->rtm_tos = fl4->flowi4_tos;
2448 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2449 if (nla_put_u32(skb, RTA_TABLE, table_id))
2450 goto nla_put_failure;
2451 r->rtm_type = rt->rt_type;
2452 r->rtm_scope = RT_SCOPE_UNIVERSE;
2453 r->rtm_protocol = RTPROT_UNSPEC;
2454 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2455 if (rt->rt_flags & RTCF_NOTIFY)
2456 r->rtm_flags |= RTM_F_NOTIFY;
2457 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2458 r->rtm_flags |= RTCF_DOREDIRECT;
2459
2460 if (nla_put_in_addr(skb, RTA_DST, dst))
2461 goto nla_put_failure;
2462 if (src) {
2463 r->rtm_src_len = 32;
2464 if (nla_put_in_addr(skb, RTA_SRC, src))
2465 goto nla_put_failure;
2466 }
2467 if (rt->dst.dev &&
2468 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2469 goto nla_put_failure;
2470 #ifdef CONFIG_IP_ROUTE_CLASSID
2471 if (rt->dst.tclassid &&
2472 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2473 goto nla_put_failure;
2474 #endif
2475 if (!rt_is_input_route(rt) &&
2476 fl4->saddr != src) {
2477 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2478 goto nla_put_failure;
2479 }
2480 if (rt->rt_uses_gateway &&
2481 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2482 goto nla_put_failure;
2483
2484 expires = rt->dst.expires;
2485 if (expires) {
2486 unsigned long now = jiffies;
2487
2488 if (time_before(now, expires))
2489 expires -= now;
2490 else
2491 expires = 0;
2492 }
2493
2494 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2495 if (rt->rt_pmtu && expires)
2496 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2497 if (rtnetlink_put_metrics(skb, metrics) < 0)
2498 goto nla_put_failure;
2499
2500 if (fl4->flowi4_mark &&
2501 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2502 goto nla_put_failure;
2503
2504 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2505 nla_put_u32(skb, RTA_UID,
2506 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2507 goto nla_put_failure;
2508
2509 error = rt->dst.error;
2510
2511 if (rt_is_input_route(rt)) {
2512 #ifdef CONFIG_IP_MROUTE
2513 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2514 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2515 int err = ipmr_get_route(net, skb,
2516 fl4->saddr, fl4->daddr,
2517 r, nowait, portid);
2518
2519 if (err <= 0) {
2520 if (!nowait) {
2521 if (err == 0)
2522 return 0;
2523 goto nla_put_failure;
2524 } else {
2525 if (err == -EMSGSIZE)
2526 goto nla_put_failure;
2527 error = err;
2528 }
2529 }
2530 } else
2531 #endif
2532 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2533 goto nla_put_failure;
2534 }
2535
2536 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2537 goto nla_put_failure;
2538
2539 nlmsg_end(skb, nlh);
2540 return 0;
2541
2542 nla_put_failure:
2543 nlmsg_cancel(skb, nlh);
2544 return -EMSGSIZE;
2545 }
2546
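/* RTM_GETROUTE handler: resolve a single route for the netlink request and
 * reply with a message built by rt_fill_info().  Userspace usage sketch
 * (illustrative, assuming iproute2):
 *
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 *
 * which maps to the RTA_DST, RTA_SRC and RTA_IIF attributes parsed below.
 */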
2547 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2548 {
2549 struct net *net = sock_net(in_skb->sk);
2550 struct rtmsg *rtm;
2551 struct nlattr *tb[RTA_MAX+1];
2552 struct rtable *rt = NULL;
2553 struct flowi4 fl4;
2554 __be32 dst = 0;
2555 __be32 src = 0;
2556 u32 iif;
2557 int err;
2558 int mark;
2559 struct sk_buff *skb;
2560 u32 table_id = RT_TABLE_MAIN;
2561 kuid_t uid;
2562
2563 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2564 if (err < 0)
2565 goto errout;
2566
2567 rtm = nlmsg_data(nlh);
2568
2569 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2570 if (!skb) {
2571 err = -ENOBUFS;
2572 goto errout;
2573 }
2574
2575 	/* Reserve room for dummy headers; this skb can pass
2576 	   through a good chunk of the routing engine.
2577 	 */
2578 skb_reset_mac_header(skb);
2579 skb_reset_network_header(skb);
2580
2581 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2582 ip_hdr(skb)->protocol = IPPROTO_UDP;
2583 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2584
2585 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2586 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2587 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2588 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2589 if (tb[RTA_UID])
2590 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2591 else
2592 uid = (iif ? INVALID_UID : current_uid());
2593
2594 memset(&fl4, 0, sizeof(fl4));
2595 fl4.daddr = dst;
2596 fl4.saddr = src;
2597 fl4.flowi4_tos = rtm->rtm_tos;
2598 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2599 fl4.flowi4_mark = mark;
2600 fl4.flowi4_uid = uid;
2601
2602 if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2603 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2604
2605 if (iif) {
2606 struct net_device *dev;
2607
2608 dev = __dev_get_by_index(net, iif);
2609 if (!dev) {
2610 err = -ENODEV;
2611 goto errout_free;
2612 }
2613
2614 skb->protocol = htons(ETH_P_IP);
2615 skb->dev = dev;
2616 skb->mark = mark;
2617 local_bh_disable();
2618 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2619 local_bh_enable();
2620
2621 rt = skb_rtable(skb);
2622 if (err == 0 && rt->dst.error)
2623 err = -rt->dst.error;
2624 } else {
2625 rt = ip_route_output_key(net, &fl4);
2626
2627 err = 0;
2628 if (IS_ERR(rt))
2629 err = PTR_ERR(rt);
2630 }
2631
2632 if (err)
2633 goto errout_free;
2634
2635 skb_dst_set(skb, &rt->dst);
2636 if (rtm->rtm_flags & RTM_F_NOTIFY)
2637 rt->rt_flags |= RTCF_NOTIFY;
2638
2639 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2640 table_id = rt->rt_table_id;
2641
2642 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2643 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2644 RTM_NEWROUTE, 0, 0);
2645 if (err < 0)
2646 goto errout_free;
2647
2648 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2649 errout:
2650 return err;
2651
2652 errout_free:
2653 kfree_skb(skb);
2654 goto errout;
2655 }
2656
2657 void ip_rt_multicast_event(struct in_device *in_dev)
2658 {
2659 rt_cache_flush(dev_net(in_dev->dev));
2660 }
2661
2662 #ifdef CONFIG_SYSCTL
2663 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2664 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2665 static int ip_rt_gc_elasticity __read_mostly = 8;
2666
2667 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2668 void __user *buffer,
2669 size_t *lenp, loff_t *ppos)
2670 {
2671 struct net *net = (struct net *)__ctl->extra1;
2672
2673 if (write) {
2674 rt_cache_flush(net);
2675 fnhe_genid_bump(net);
2676 return 0;
2677 }
2678
2679 return -EINVAL;
2680 }
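
/* The handler above backs the write-only "flush" sysctl registered below;
 * usage sketch from userspace (illustrative only):
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * which bumps the per-netns route and fnhe generation counters, invalidating
 * all cached routes and nexthop exceptions.
 */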
2681
2682 static struct ctl_table ipv4_route_table[] = {
2683 {
2684 .procname = "gc_thresh",
2685 .data = &ipv4_dst_ops.gc_thresh,
2686 .maxlen = sizeof(int),
2687 .mode = 0644,
2688 .proc_handler = proc_dointvec,
2689 },
2690 {
2691 .procname = "max_size",
2692 .data = &ip_rt_max_size,
2693 .maxlen = sizeof(int),
2694 .mode = 0644,
2695 .proc_handler = proc_dointvec,
2696 },
2697 {
2698 /* Deprecated. Use gc_min_interval_ms */
2699
2700 .procname = "gc_min_interval",
2701 .data = &ip_rt_gc_min_interval,
2702 .maxlen = sizeof(int),
2703 .mode = 0644,
2704 .proc_handler = proc_dointvec_jiffies,
2705 },
2706 {
2707 .procname = "gc_min_interval_ms",
2708 .data = &ip_rt_gc_min_interval,
2709 .maxlen = sizeof(int),
2710 .mode = 0644,
2711 .proc_handler = proc_dointvec_ms_jiffies,
2712 },
2713 {
2714 .procname = "gc_timeout",
2715 .data = &ip_rt_gc_timeout,
2716 .maxlen = sizeof(int),
2717 .mode = 0644,
2718 .proc_handler = proc_dointvec_jiffies,
2719 },
2720 {
2721 .procname = "gc_interval",
2722 .data = &ip_rt_gc_interval,
2723 .maxlen = sizeof(int),
2724 .mode = 0644,
2725 .proc_handler = proc_dointvec_jiffies,
2726 },
2727 {
2728 .procname = "redirect_load",
2729 .data = &ip_rt_redirect_load,
2730 .maxlen = sizeof(int),
2731 .mode = 0644,
2732 .proc_handler = proc_dointvec,
2733 },
2734 {
2735 .procname = "redirect_number",
2736 .data = &ip_rt_redirect_number,
2737 .maxlen = sizeof(int),
2738 .mode = 0644,
2739 .proc_handler = proc_dointvec,
2740 },
2741 {
2742 .procname = "redirect_silence",
2743 .data = &ip_rt_redirect_silence,
2744 .maxlen = sizeof(int),
2745 .mode = 0644,
2746 .proc_handler = proc_dointvec,
2747 },
2748 {
2749 .procname = "error_cost",
2750 .data = &ip_rt_error_cost,
2751 .maxlen = sizeof(int),
2752 .mode = 0644,
2753 .proc_handler = proc_dointvec,
2754 },
2755 {
2756 .procname = "error_burst",
2757 .data = &ip_rt_error_burst,
2758 .maxlen = sizeof(int),
2759 .mode = 0644,
2760 .proc_handler = proc_dointvec,
2761 },
2762 {
2763 .procname = "gc_elasticity",
2764 .data = &ip_rt_gc_elasticity,
2765 .maxlen = sizeof(int),
2766 .mode = 0644,
2767 .proc_handler = proc_dointvec,
2768 },
2769 {
2770 .procname = "mtu_expires",
2771 .data = &ip_rt_mtu_expires,
2772 .maxlen = sizeof(int),
2773 .mode = 0644,
2774 .proc_handler = proc_dointvec_jiffies,
2775 },
2776 {
2777 .procname = "min_pmtu",
2778 .data = &ip_rt_min_pmtu,
2779 .maxlen = sizeof(int),
2780 .mode = 0644,
2781 .proc_handler = proc_dointvec,
2782 },
2783 {
2784 .procname = "min_adv_mss",
2785 .data = &ip_rt_min_advmss,
2786 .maxlen = sizeof(int),
2787 .mode = 0644,
2788 .proc_handler = proc_dointvec,
2789 },
2790 { }
2791 };
2792
2793 static struct ctl_table ipv4_route_flush_table[] = {
2794 {
2795 .procname = "flush",
2796 .maxlen = sizeof(int),
2797 .mode = 0200,
2798 .proc_handler = ipv4_sysctl_rtcache_flush,
2799 },
2800 { },
2801 };
2802
2803 static __net_init int sysctl_route_net_init(struct net *net)
2804 {
2805 struct ctl_table *tbl;
2806
2807 tbl = ipv4_route_flush_table;
2808 if (!net_eq(net, &init_net)) {
2809 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2810 if (!tbl)
2811 goto err_dup;
2812
2813 /* Don't export sysctls to unprivileged users */
2814 if (net->user_ns != &init_user_ns)
2815 tbl[0].procname = NULL;
2816 }
2817 tbl[0].extra1 = net;
2818
2819 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2820 if (!net->ipv4.route_hdr)
2821 goto err_reg;
2822 return 0;
2823
2824 err_reg:
2825 if (tbl != ipv4_route_flush_table)
2826 kfree(tbl);
2827 err_dup:
2828 return -ENOMEM;
2829 }
2830
2831 static __net_exit void sysctl_route_net_exit(struct net *net)
2832 {
2833 struct ctl_table *tbl;
2834
2835 tbl = net->ipv4.route_hdr->ctl_table_arg;
2836 unregister_net_sysctl_table(net->ipv4.route_hdr);
2837 BUG_ON(tbl == ipv4_route_flush_table);
2838 kfree(tbl);
2839 }
2840
2841 static __net_initdata struct pernet_operations sysctl_route_ops = {
2842 .init = sysctl_route_net_init,
2843 .exit = sysctl_route_net_exit,
2844 };
2845 #endif
2846
2847 static __net_init int rt_genid_init(struct net *net)
2848 {
2849 atomic_set(&net->ipv4.rt_genid, 0);
2850 atomic_set(&net->fnhe_genid, 0);
2851 get_random_bytes(&net->ipv4.dev_addr_genid,
2852 sizeof(net->ipv4.dev_addr_genid));
2853 return 0;
2854 }
2855
2856 static __net_initdata struct pernet_operations rt_genid_ops = {
2857 .init = rt_genid_init,
2858 };
2859
2860 static int __net_init ipv4_inetpeer_init(struct net *net)
2861 {
2862 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2863
2864 if (!bp)
2865 return -ENOMEM;
2866 inet_peer_base_init(bp);
2867 net->ipv4.peers = bp;
2868 return 0;
2869 }
2870
2871 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2872 {
2873 struct inet_peer_base *bp = net->ipv4.peers;
2874
2875 net->ipv4.peers = NULL;
2876 inetpeer_invalidate_tree(bp);
2877 kfree(bp);
2878 }
2879
2880 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2881 .init = ipv4_inetpeer_init,
2882 .exit = ipv4_inetpeer_exit,
2883 };
2884
2885 #ifdef CONFIG_IP_ROUTE_CLASSID
2886 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2887 #endif /* CONFIG_IP_ROUTE_CLASSID */
2888
2889 int __init ip_rt_init(void)
2890 {
2891 int rc = 0;
2892 int cpu;
2893
2894 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2895 if (!ip_idents)
2896 panic("IP: failed to allocate ip_idents\n");
2897
2898 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2899
2900 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2901 if (!ip_tstamps)
2902 panic("IP: failed to allocate ip_tstamps\n");
2903
2904 for_each_possible_cpu(cpu) {
2905 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2906
2907 INIT_LIST_HEAD(&ul->head);
2908 spin_lock_init(&ul->lock);
2909 }
2910 #ifdef CONFIG_IP_ROUTE_CLASSID
2911 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2912 if (!ip_rt_acct)
2913 panic("IP: failed to allocate ip_rt_acct\n");
2914 #endif
2915
2916 ipv4_dst_ops.kmem_cachep =
2917 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2918 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2919
2920 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2921
2922 if (dst_entries_init(&ipv4_dst_ops) < 0)
2923 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2924
2925 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2926 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2927
2928 ipv4_dst_ops.gc_thresh = ~0;
2929 ip_rt_max_size = INT_MAX;
2930
2931 devinet_init();
2932 ip_fib_init();
2933
2934 if (ip_rt_proc_init())
2935 pr_err("Unable to create route proc files\n");
2936 #ifdef CONFIG_XFRM
2937 xfrm_init();
2938 xfrm4_init();
2939 #endif
2940 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2941
2942 #ifdef CONFIG_SYSCTL
2943 register_pernet_subsys(&sysctl_route_ops);
2944 #endif
2945 register_pernet_subsys(&rt_genid_ops);
2946 register_pernet_subsys(&ipv4_inetpeer_ops);
2947 return rc;
2948 }
2949
2950 #ifdef CONFIG_SYSCTL
2951 /*
2952 * We really need to sanitize the damn ipv4 init order, then all
2953 * this nonsense will go away.
2954 */
2955 void __init ip_static_sysctl_init(void)
2956 {
2957 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2958 }
2959 #endif