defconfig: exynos9610: Re-add dropped Wi-Fi AP options lost
[GitHub/LineageOS/android_kernel_motorola_exynos9610.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
/* Routing-relevant TOS bits of a flow key, together with the on-link flag. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Initial value of ip_rt_gc_timeout: 300 * HZ jiffies. */
#define RT_GC_TIMEOUT	(300*HZ)
123
1da177e4 124static int ip_rt_max_size;
817bc4db
SH
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
3bcf69f8 131static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 132static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 133
deed49df 134static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
3bcf69f8
SD
135
136static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
1da177e4
LT
138/*
139 * Interface to generic destination cache.
140 */
141
142static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 143static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 144static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
147static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
caacf05e 151static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 152
62fa8a84
DM
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
31248731
DM
155 WARN_ON(1);
156 return NULL;
62fa8a84
DM
157}
158
f894cbf8
DM
/* Prototypes for the neighbour-related callbacks of ipv4_dst_ops. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst,
			       const void *daddr);
d3aaeb38 163
1da177e4
LT
164static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
1da177e4 166 .check = ipv4_dst_check,
0dbaee3b 167 .default_advmss = ipv4_default_advmss,
ebb762f2 168 .mtu = ipv4_mtu,
62fa8a84 169 .cow_metrics = ipv4_cow_metrics,
caacf05e 170 .destroy = ipv4_dst_destroy,
1da177e4
LT
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
e47a185b 174 .redirect = ip_do_redirect,
b92dacd4 175 .local_out = __ip_local_out,
d3aaeb38 176 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 177 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
178};
179
180#define ECN_OR_COST(class) TC_PRIO_##class
181
4839c52b 182const __u8 ip_tos2prio[16] = {
1da177e4 183 TC_PRIO_BESTEFFORT,
4a2b9c37 184 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199};
d4a96865 200EXPORT_SYMBOL(ip_tos2prio);
1da177e4 201
2f970d83 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 203#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 204
1da177e4 205#ifdef CONFIG_PROC_FS
1da177e4
LT
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{
29e75252 208 if (*pos)
89aef892 209 return NULL;
29e75252 210 return SEQ_START_TOKEN;
1da177e4
LT
211}
212
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{
1da177e4 215 ++*pos;
89aef892 216 return NULL;
1da177e4
LT
217}
218
/* seq_file ->stop for the rt_cache file: nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223static int rt_cache_seq_show(struct seq_file *seq, void *v)
224{
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
e905a9ed 230 return 0;
1da177e4
LT
231}
232
f690808e 233static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238};
239
240static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{
89aef892 242 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
243}
244
9a32144e 245static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
89aef892 250 .release = seq_release,
1da177e4
LT
251};
252
253
254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255{
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
0f23174a 261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
2f970d83 265 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
266 }
267 return NULL;
268}
269
270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271{
272 int cpu;
273
0f23174a 274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
2f970d83 278 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
279 }
280 return NULL;
e905a9ed 281
1da177e4
LT
282}
283
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
288
289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290{
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
5bec0039 294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
295 return 0;
296 }
e905a9ed 297
1da177e4
LT
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 300 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 301 0, /* st->in_hit */
1da177e4
LT
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
0baf2b35 309 0, /* st->out_hit */
1da177e4 310 st->out_slow_tot,
e905a9ed 311 st->out_slow_mc,
1da177e4 312
0baf2b35
ED
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
1da177e4
LT
319 );
320 return 0;
321}
322
f690808e 323static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328};
329
330
331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332{
333 return seq_open(file, &rt_cpu_seq_ops);
334}
335
9a32144e 336static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342};
343
c7066f70 344#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 345static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 346{
a661c419
AD
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
78c686e9
PE
362 }
363
a661c419
AD
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
78c686e9 368
a661c419
AD
369static int rt_acct_proc_open(struct inode *inode, struct file *file)
370{
371 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 372}
a661c419
AD
373
374static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
78c686e9 381#endif
107f1634 382
73b38711 383static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
384{
385 struct proc_dir_entry *pde;
386
d4beaa66
G
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
107f1634
PE
389 if (!pde)
390 goto err1;
391
77020720
WC
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
394 if (!pde)
395 goto err2;
396
c7066f70 397#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
399 if (!pde)
400 goto err3;
401#endif
402 return 0;
403
c7066f70 404#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
405err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407#endif
408err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410err1:
411 return -ENOMEM;
412}
73b38711
DL
413
414static void __net_exit ip_rt_do_proc_exit(struct net *net)
415{
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 418#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 419 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 420#endif
73b38711
DL
421}
422
423static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426};
427
428static int __init ip_rt_proc_init(void)
429{
430 return register_pernet_subsys(&ip_rt_proc_ops);
431}
432
107f1634 433#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 438#endif /* CONFIG_PROC_FS */
e905a9ed 439
4331debc 440static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 441{
ca4c3fc2 442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
443}
444
/*
 * Flush the routing cache of @net by bumping its IPv4 route generation id
 * (the value rt_is_expired() compares against).
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
449
f894cbf8
DM
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
3769cffb 453{
d3aaeb38
DM
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
39232973 456 const struct rtable *rt;
3769cffb
DM
457 struct neighbour *n;
458
39232973 459 rt = (const struct rtable *) dst;
a263b309 460 if (rt->rt_gateway)
39232973 461 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 464
80703d26 465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
466 if (n)
467 return n;
32092ecf 468 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
469}
470
63fca65d
JA
471static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472{
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486}
487
04ca6973 488#define IP_IDENTS_SZ 2048u
04ca6973 489
355b590c
ED
490static atomic_t *ip_idents __read_mostly;
491static u32 *ip_tstamps __read_mostly;
04ca6973
ED
492
493/* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497u32 ip_idents_reserve(u32 hash, int segs)
498{
355b590c
ED
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
501 u32 old = ACCESS_ONCE(*p_tstamp);
04ca6973 502 u32 now = (u32)jiffies;
adb03115 503 u32 new, delta = 0;
04ca6973 504
355b590c 505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
506 delta = prandom_u32_max(now - old);
507
adb03115
ED
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
04ca6973
ED
515}
516EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 517
b6a7719a 518void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 519{
73f156a6 520 u32 hash, id;
1da177e4 521
47aed2a9
ED
522 /* Note the following code is not safe, but this is okay. */
523 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
524 get_random_bytes(&net->ipv4.ip_id_key,
525 sizeof(net->ipv4.ip_id_key));
1da177e4 526
47aed2a9 527 hash = siphash_3u32((__force u32)iph->daddr,
04ca6973 528 (__force u32)iph->saddr,
47aed2a9
ED
529 iph->protocol,
530 &net->ipv4.ip_id_key);
73f156a6
ED
531 id = ip_idents_reserve(hash, segs);
532 iph->id = htons(id);
1da177e4 533}
4bc2f18b 534EXPORT_SYMBOL(__ip_select_ident);
1da177e4 535
e2d118a1
LC
536static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
537 const struct sock *sk,
4895c771
DM
538 const struct iphdr *iph,
539 int oif, u8 tos,
540 u8 prot, u32 mark, int flow_flags)
541{
542 if (sk) {
543 const struct inet_sock *inet = inet_sk(sk);
544
545 oif = sk->sk_bound_dev_if;
546 mark = sk->sk_mark;
547 tos = RT_CONN_FLAGS(sk);
548 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
549 }
550 flowi4_init_output(fl4, oif, mark, tos,
551 RT_SCOPE_UNIVERSE, prot,
552 flow_flags,
e2d118a1
LC
553 iph->daddr, iph->saddr, 0, 0,
554 sock_net_uid(net, sk));
4895c771
DM
555}
556
5abf7f7e
ED
557static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
558 const struct sock *sk)
4895c771 559{
d109e61b 560 const struct net *net = dev_net(skb->dev);
4895c771
DM
561 const struct iphdr *iph = ip_hdr(skb);
562 int oif = skb->dev->ifindex;
563 u8 tos = RT_TOS(iph->tos);
564 u8 prot = iph->protocol;
565 u32 mark = skb->mark;
566
d109e61b 567 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
568}
569
5abf7f7e 570static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
571{
572 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 573 const struct ip_options_rcu *inet_opt;
4895c771
DM
574 __be32 daddr = inet->inet_daddr;
575
576 rcu_read_lock();
577 inet_opt = rcu_dereference(inet->inet_opt);
578 if (inet_opt && inet_opt->opt.srr)
579 daddr = inet_opt->opt.faddr;
580 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
581 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
582 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
583 inet_sk_flowi_flags(sk),
e2d118a1 584 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
585 rcu_read_unlock();
586}
587
5abf7f7e
ED
/*
 * Build a flow key from @skb when one is supplied, otherwise from the
 * socket @sk alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
596
/* Held (with BHs disabled) by update_or_create_fnhe() while it modifies nexthop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
4895c771 598
2ffae99d
TT
599static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
600{
601 struct rtable *rt;
602
603 rt = rcu_dereference(fnhe->fnhe_rth_input);
604 if (rt) {
605 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 606 dst_dev_put(&rt->dst);
0830106c 607 dst_release(&rt->dst);
2ffae99d
TT
608 }
609 rt = rcu_dereference(fnhe->fnhe_rth_output);
610 if (rt) {
611 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 612 dst_dev_put(&rt->dst);
0830106c 613 dst_release(&rt->dst);
2ffae99d
TT
614 }
615}
616
aee06da6 617static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
618{
619 struct fib_nh_exception *fnhe, *oldest;
620
621 oldest = rcu_dereference(hash->chain);
622 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
623 fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
625 oldest = fnhe;
626 }
2ffae99d 627 fnhe_flush_routes(oldest);
4895c771
DM
628 return oldest;
629}
630
d3a25c98
DM
631static inline u32 fnhe_hashfun(__be32 daddr)
632{
d546c621 633 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
634 u32 hval;
635
d546c621
ED
636 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
637 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
638 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
639}
640
387aa65a
TT
641static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
642{
643 rt->rt_pmtu = fnhe->fnhe_pmtu;
8387fbac 644 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
387aa65a
TT
645 rt->dst.expires = fnhe->fnhe_expires;
646
647 if (fnhe->fnhe_gw) {
648 rt->rt_flags |= RTCF_REDIRECTED;
649 rt->rt_gateway = fnhe->fnhe_gw;
650 rt->rt_uses_gateway = 1;
651 }
652}
653
aee06da6 654static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
8387fbac 655 u32 pmtu, bool lock, unsigned long expires)
4895c771 656{
aee06da6 657 struct fnhe_hash_bucket *hash;
4895c771 658 struct fib_nh_exception *fnhe;
387aa65a 659 struct rtable *rt;
407de7d9 660 u32 genid, hval;
387aa65a 661 unsigned int i;
4895c771 662 int depth;
407de7d9
XL
663
664 genid = fnhe_genid(dev_net(nh->nh_dev));
665 hval = fnhe_hashfun(daddr);
aee06da6 666
c5038a83 667 spin_lock_bh(&fnhe_lock);
4895c771 668
caa41527 669 hash = rcu_dereference(nh->nh_exceptions);
4895c771 670 if (!hash) {
aee06da6 671 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 672 if (!hash)
aee06da6 673 goto out_unlock;
caa41527 674 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
675 }
676
4895c771
DM
677 hash += hval;
678
679 depth = 0;
680 for (fnhe = rcu_dereference(hash->chain); fnhe;
681 fnhe = rcu_dereference(fnhe->fnhe_next)) {
682 if (fnhe->fnhe_daddr == daddr)
aee06da6 683 break;
4895c771
DM
684 depth++;
685 }
686
aee06da6 687 if (fnhe) {
407de7d9
XL
688 if (fnhe->fnhe_genid != genid)
689 fnhe->fnhe_genid = genid;
aee06da6
JA
690 if (gw)
691 fnhe->fnhe_gw = gw;
8387fbac 692 if (pmtu) {
aee06da6 693 fnhe->fnhe_pmtu = pmtu;
8387fbac
SD
694 fnhe->fnhe_mtu_locked = lock;
695 }
ad199b18 696 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 697 /* Update all cached dsts too */
2ffae99d
TT
698 rt = rcu_dereference(fnhe->fnhe_rth_input);
699 if (rt)
700 fill_route_from_fnhe(rt, fnhe);
701 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
702 if (rt)
703 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
704 } else {
705 if (depth > FNHE_RECLAIM_DEPTH)
706 fnhe = fnhe_oldest(hash);
707 else {
708 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
709 if (!fnhe)
710 goto out_unlock;
711
712 fnhe->fnhe_next = hash->chain;
713 rcu_assign_pointer(hash->chain, fnhe);
714 }
407de7d9 715 fnhe->fnhe_genid = genid;
aee06da6
JA
716 fnhe->fnhe_daddr = daddr;
717 fnhe->fnhe_gw = gw;
718 fnhe->fnhe_pmtu = pmtu;
8387fbac 719 fnhe->fnhe_mtu_locked = lock;
c751af52 720 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
721
722 /* Exception created; mark the cached routes for the nexthop
723 * stale, so anyone caching it rechecks if this exception
724 * applies to them.
725 */
2ffae99d
TT
726 rt = rcu_dereference(nh->nh_rth_input);
727 if (rt)
728 rt->dst.obsolete = DST_OBSOLETE_KILL;
729
387aa65a
TT
730 for_each_possible_cpu(i) {
731 struct rtable __rcu **prt;
732 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
733 rt = rcu_dereference(*prt);
734 if (rt)
735 rt->dst.obsolete = DST_OBSOLETE_KILL;
736 }
4895c771 737 }
4895c771 738
4895c771 739 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
740
741out_unlock:
c5038a83 742 spin_unlock_bh(&fnhe_lock);
4895c771
DM
743}
744
ceb33206
DM
745static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
746 bool kill_route)
1da177e4 747{
e47a185b 748 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 749 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 750 struct net_device *dev = skb->dev;
e47a185b 751 struct in_device *in_dev;
4895c771 752 struct fib_result res;
e47a185b 753 struct neighbour *n;
317805b8 754 struct net *net;
1da177e4 755
94206125
DM
756 switch (icmp_hdr(skb)->code & 7) {
757 case ICMP_REDIR_NET:
758 case ICMP_REDIR_NETTOS:
759 case ICMP_REDIR_HOST:
760 case ICMP_REDIR_HOSTTOS:
761 break;
762
763 default:
764 return;
765 }
766
e47a185b
DM
767 if (rt->rt_gateway != old_gw)
768 return;
769
770 in_dev = __in_dev_get_rcu(dev);
771 if (!in_dev)
772 return;
773
c346dca1 774 net = dev_net(dev);
9d4fb27d
JP
775 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
776 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
777 ipv4_is_zeronet(new_gw))
1da177e4
LT
778 goto reject_redirect;
779
780 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
781 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
782 goto reject_redirect;
783 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
784 goto reject_redirect;
785 } else {
317805b8 786 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
787 goto reject_redirect;
788 }
789
969447f2
SSL
790 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
791 if (!n)
792 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 793 if (!IS_ERR(n)) {
e47a185b
DM
794 if (!(n->nud_state & NUD_VALID)) {
795 neigh_event_send(n, NULL);
796 } else {
0eeb075f 797 if (fib_lookup(net, fl4, &res, 0) == 0) {
4895c771 798 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 799
aee06da6 800 update_or_create_fnhe(nh, fl4->daddr, new_gw,
8387fbac
SD
801 0, false,
802 jiffies + ip_rt_gc_timeout);
4895c771 803 }
ceb33206
DM
804 if (kill_route)
805 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
806 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
807 }
808 neigh_release(n);
809 }
810 return;
811
812reject_redirect:
813#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
814 if (IN_DEV_LOG_MARTIANS(in_dev)) {
815 const struct iphdr *iph = (const struct iphdr *) skb->data;
816 __be32 daddr = iph->daddr;
817 __be32 saddr = iph->saddr;
818
e47a185b
DM
819 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
820 " Advised path = %pI4 -> %pI4\n",
821 &old_gw, dev->name, &new_gw,
822 &saddr, &daddr);
99ee038d 823 }
e47a185b
DM
824#endif
825 ;
826}
827
4895c771
DM
828static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
829{
830 struct rtable *rt;
831 struct flowi4 fl4;
f96ef988 832 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 833 struct net *net = dev_net(skb->dev);
f96ef988
MK
834 int oif = skb->dev->ifindex;
835 u8 tos = RT_TOS(iph->tos);
836 u8 prot = iph->protocol;
837 u32 mark = skb->mark;
4895c771
DM
838
839 rt = (struct rtable *) dst;
840
7d995694 841 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 842 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
843}
844
1da177e4
LT
845static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
846{
ee6b9673 847 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
848 struct dst_entry *ret = dst;
849
850 if (rt) {
d11a4dc1 851 if (dst->obsolete > 0) {
1da177e4
LT
852 ip_rt_put(rt);
853 ret = NULL;
5943634f
DM
854 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
855 rt->dst.expires) {
89aef892 856 ip_rt_put(rt);
1da177e4
LT
857 ret = NULL;
858 }
859 }
860 return ret;
861}
862
863/*
864 * Algorithm:
865 * 1. The first ip_rt_redirect_number redirects are sent
866 * with exponential backoff, then we stop sending them at all,
867 * assuming that the host ignores our redirects.
868 * 2. If we did not see packets requiring redirects
869 * during ip_rt_redirect_silence, we assume that the host
870 * forgot redirected route and start to send redirects again.
871 *
872 * This algorithm is much cheaper and more intelligent than dumb load limiting
873 * in icmp.c.
874 *
875 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
876 * and "frag. need" (breaks PMTU discovery) in icmp.c.
877 */
878
879void ip_rt_send_redirect(struct sk_buff *skb)
880{
511c3f92 881 struct rtable *rt = skb_rtable(skb);
30038fc6 882 struct in_device *in_dev;
92d86829 883 struct inet_peer *peer;
1d861aa4 884 struct net *net;
30038fc6 885 int log_martians;
192132b9 886 int vif;
1da177e4 887
30038fc6 888 rcu_read_lock();
d8d1f30b 889 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
890 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
891 rcu_read_unlock();
1da177e4 892 return;
30038fc6
ED
893 }
894 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 895 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 896 rcu_read_unlock();
1da177e4 897
1d861aa4 898 net = dev_net(rt->dst.dev);
192132b9 899 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 900 if (!peer) {
e81da0e1
JA
901 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
902 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
903 return;
904 }
905
1da177e4
LT
906 /* No redirected packets during ip_rt_redirect_silence;
907 * reset the algorithm.
908 */
4e4d02b9 909 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
92d86829 910 peer->rate_tokens = 0;
4e4d02b9
LB
911 peer->n_redirects = 0;
912 }
1da177e4
LT
913
914 /* Too many ignored redirects; do not send anything
d8d1f30b 915 * set dst.rate_last to the last seen redirected packet.
1da177e4 916 */
4e4d02b9 917 if (peer->n_redirects >= ip_rt_redirect_number) {
92d86829 918 peer->rate_last = jiffies;
1d861aa4 919 goto out_put_peer;
1da177e4
LT
920 }
921
922 /* Check for load limit; set rate_last to the latest sent
923 * redirect.
924 */
92d86829 925 if (peer->rate_tokens == 0 ||
14fb8a76 926 time_after(jiffies,
92d86829
DM
927 (peer->rate_last +
928 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
929 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
930
931 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
932 peer->rate_last = jiffies;
933 ++peer->rate_tokens;
4e4d02b9 934 ++peer->n_redirects;
1da177e4 935#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 936 if (log_martians &&
e87cc472
JP
937 peer->rate_tokens == ip_rt_redirect_number)
938 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 939 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 940 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
941#endif
942 }
1d861aa4
DM
943out_put_peer:
944 inet_putpeer(peer);
1da177e4
LT
945}
946
947static int ip_error(struct sk_buff *skb)
948{
251da413 949 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 950 struct rtable *rt = skb_rtable(skb);
92d86829 951 struct inet_peer *peer;
1da177e4 952 unsigned long now;
251da413 953 struct net *net;
92d86829 954 bool send;
1da177e4
LT
955 int code;
956
381c759d
EB
957 /* IP on this device is disabled. */
958 if (!in_dev)
959 goto out;
960
251da413
DM
961 net = dev_net(rt->dst.dev);
962 if (!IN_DEV_FORWARD(in_dev)) {
963 switch (rt->dst.error) {
964 case EHOSTUNREACH:
b45386ef 965 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
251da413
DM
966 break;
967
968 case ENETUNREACH:
b45386ef 969 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
251da413
DM
970 break;
971 }
972 goto out;
973 }
974
d8d1f30b 975 switch (rt->dst.error) {
4500ebf8
JP
976 case EINVAL:
977 default:
978 goto out;
979 case EHOSTUNREACH:
980 code = ICMP_HOST_UNREACH;
981 break;
982 case ENETUNREACH:
983 code = ICMP_NET_UNREACH;
b45386ef 984 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
985 break;
986 case EACCES:
987 code = ICMP_PKT_FILTERED;
988 break;
1da177e4
LT
989 }
990
192132b9 991 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
385add90 992 l3mdev_master_ifindex(skb->dev), 1);
92d86829
DM
993
994 send = true;
995 if (peer) {
996 now = jiffies;
997 peer->rate_tokens += now - peer->rate_last;
998 if (peer->rate_tokens > ip_rt_error_burst)
999 peer->rate_tokens = ip_rt_error_burst;
1000 peer->rate_last = now;
1001 if (peer->rate_tokens >= ip_rt_error_cost)
1002 peer->rate_tokens -= ip_rt_error_cost;
1003 else
1004 send = false;
1d861aa4 1005 inet_putpeer(peer);
1da177e4 1006 }
92d86829
DM
1007 if (send)
1008 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1009
1010out: kfree_skb(skb);
1011 return 0;
e905a9ed 1012}
1da177e4 1013
d851c12b 1014static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 1015{
d851c12b 1016 struct dst_entry *dst = &rt->dst;
4895c771 1017 struct fib_result res;
8387fbac 1018 bool lock = false;
2c8cec5c 1019
8387fbac 1020 if (ip_mtu_locked(dst))
fa1e492a
SK
1021 return;
1022
cb6ccf09 1023 if (ipv4_mtu(dst) < mtu)
3cdaa5be
LW
1024 return;
1025
8387fbac
SD
1026 if (mtu < ip_rt_min_pmtu) {
1027 lock = true;
5943634f 1028 mtu = ip_rt_min_pmtu;
8387fbac 1029 }
2c8cec5c 1030
f016229e
TT
1031 if (rt->rt_pmtu == mtu &&
1032 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033 return;
1034
c5ae7d41 1035 rcu_read_lock();
0eeb075f 1036 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
4895c771 1037 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 1038
8387fbac 1039 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
aee06da6 1040 jiffies + ip_rt_mtu_expires);
4895c771 1041 }
c5ae7d41 1042 rcu_read_unlock();
1da177e4
LT
1043}
1044
4895c771
DM
1045static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046 struct sk_buff *skb, u32 mtu)
1047{
1048 struct rtable *rt = (struct rtable *) dst;
1049 struct flowi4 fl4;
1050
1051 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1052 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1053}
1054
36393395
DM
1055void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1056 int oif, u32 mark, u8 protocol, int flow_flags)
1057{
4895c771 1058 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1059 struct flowi4 fl4;
1060 struct rtable *rt;
1061
1b3c61dc
LC
1062 if (!mark)
1063 mark = IP4_REPLY_MARK(net, skb->mark);
1064
e2d118a1 1065 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1066 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1067 rt = __ip_route_output_key(net, &fl4);
1068 if (!IS_ERR(rt)) {
4895c771 1069 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1070 ip_rt_put(rt);
1071 }
1072}
1073EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1074
9cb3a50c 1075static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1076{
4895c771
DM
1077 const struct iphdr *iph = (const struct iphdr *) skb->data;
1078 struct flowi4 fl4;
1079 struct rtable *rt;
36393395 1080
e2d118a1 1081 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1082
1083 if (!fl4.flowi4_mark)
1084 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1085
4895c771
DM
1086 rt = __ip_route_output_key(sock_net(sk), &fl4);
1087 if (!IS_ERR(rt)) {
1088 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089 ip_rt_put(rt);
1090 }
36393395 1091}
9cb3a50c
SK
1092
1093void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1094{
1095 const struct iphdr *iph = (const struct iphdr *) skb->data;
1096 struct flowi4 fl4;
1097 struct rtable *rt;
7f502361 1098 struct dst_entry *odst = NULL;
b44108db 1099 bool new = false;
e2d118a1 1100 struct net *net = sock_net(sk);
9cb3a50c
SK
1101
1102 bh_lock_sock(sk);
482fc609
HFS
1103
1104 if (!ip_sk_accept_pmtu(sk))
1105 goto out;
1106
7f502361 1107 odst = sk_dst_get(sk);
9cb3a50c 1108
7f502361 1109 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1110 __ipv4_sk_update_pmtu(skb, sk, mtu);
1111 goto out;
1112 }
1113
e2d118a1 1114 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1115
7f502361 1116 rt = (struct rtable *)odst;
51456b29 1117 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1118 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119 if (IS_ERR(rt))
1120 goto out;
b44108db
SK
1121
1122 new = true;
9cb3a50c
SK
1123 }
1124
1125 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1126
7f502361 1127 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1128 if (new)
1129 dst_release(&rt->dst);
1130
9cb3a50c
SK
1131 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1132 if (IS_ERR(rt))
1133 goto out;
1134
b44108db 1135 new = true;
9cb3a50c
SK
1136 }
1137
b44108db 1138 if (new)
7f502361 1139 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1140
1141out:
1142 bh_unlock_sock(sk);
7f502361 1143 dst_release(odst);
9cb3a50c 1144}
36393395 1145EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1146
b42597e2
DM
1147void ipv4_redirect(struct sk_buff *skb, struct net *net,
1148 int oif, u32 mark, u8 protocol, int flow_flags)
1149{
4895c771 1150 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1151 struct flowi4 fl4;
1152 struct rtable *rt;
1153
e2d118a1 1154 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1155 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1156 rt = __ip_route_output_key(net, &fl4);
1157 if (!IS_ERR(rt)) {
ceb33206 1158 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1159 ip_rt_put(rt);
1160 }
1161}
1162EXPORT_SYMBOL_GPL(ipv4_redirect);
1163
1164void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1165{
4895c771
DM
1166 const struct iphdr *iph = (const struct iphdr *) skb->data;
1167 struct flowi4 fl4;
1168 struct rtable *rt;
e2d118a1 1169 struct net *net = sock_net(sk);
b42597e2 1170
e2d118a1
LC
1171 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1172 rt = __ip_route_output_key(net, &fl4);
4895c771 1173 if (!IS_ERR(rt)) {
ceb33206 1174 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1175 ip_rt_put(rt);
1176 }
b42597e2
DM
1177}
1178EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1179
efbc368d
DM
1180static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1181{
1182 struct rtable *rt = (struct rtable *) dst;
1183
ceb33206
DM
1184 /* All IPV4 dsts are created with ->obsolete set to the value
1185 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1186 * into this function always.
1187 *
387aa65a
TT
1188 * When a PMTU/redirect information update invalidates a route,
1189 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1190 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1191 */
387aa65a 1192 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1193 return NULL;
d11a4dc1 1194 return dst;
1da177e4
LT
1195}
1196
1da177e4
LT
1197static void ipv4_link_failure(struct sk_buff *skb)
1198{
1199 struct rtable *rt;
1200
1201 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1202
511c3f92 1203 rt = skb_rtable(skb);
5943634f
DM
1204 if (rt)
1205 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1206}
1207
ede2059d 1208static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1209{
91df42be
JP
1210 pr_debug("%s: %pI4 -> %pI4, %s\n",
1211 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1212 skb->dev ? skb->dev->name : "?");
1da177e4 1213 kfree_skb(skb);
c378a9c0 1214 WARN_ON(1);
1da177e4
LT
1215 return 0;
1216}
1217
1218/*
1219 We do not cache source address of outgoing interface,
1220 because it is used only by IP RR, TS and SRR options,
1221 so that it out of fast path.
1222
1223 BTW remember: "addr" is allowed to be not aligned
1224 in IP options!
1225 */
1226
8e36360a 1227void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1228{
a61ced5d 1229 __be32 src;
1da177e4 1230
c7537967 1231 if (rt_is_output_route(rt))
c5be24ff 1232 src = ip_hdr(skb)->saddr;
ebc0ffae 1233 else {
8e36360a
DM
1234 struct fib_result res;
1235 struct flowi4 fl4;
1236 struct iphdr *iph;
1237
1238 iph = ip_hdr(skb);
1239
1240 memset(&fl4, 0, sizeof(fl4));
1241 fl4.daddr = iph->daddr;
1242 fl4.saddr = iph->saddr;
b0fe4a31 1243 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1244 fl4.flowi4_oif = rt->dst.dev->ifindex;
1245 fl4.flowi4_iif = skb->dev->ifindex;
1246 fl4.flowi4_mark = skb->mark;
5e2b61f7 1247
ebc0ffae 1248 rcu_read_lock();
0eeb075f 1249 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
436c3b66 1250 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1251 else
f8126f1d
DM
1252 src = inet_select_addr(rt->dst.dev,
1253 rt_nexthop(rt, iph->daddr),
1254 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1255 rcu_read_unlock();
1256 }
1da177e4
LT
1257 memcpy(addr, &src, 4);
1258}
1259
c7066f70 1260#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1261static void set_class_tag(struct rtable *rt, u32 tag)
1262{
d8d1f30b
CG
1263 if (!(rt->dst.tclassid & 0xFFFF))
1264 rt->dst.tclassid |= tag & 0xFFFF;
1265 if (!(rt->dst.tclassid & 0xFFFF0000))
1266 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1267}
1268#endif
1269
0dbaee3b
DM
1270static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1271{
7ed14d97 1272 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
fdfcb06c 1273 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1274 ip_rt_min_advmss);
0dbaee3b 1275
7ed14d97 1276 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1277}
1278
ebb762f2 1279static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1280{
261663b0 1281 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1282 unsigned int mtu = rt->rt_pmtu;
1283
98d75c37 1284 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1285 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1286
38d523e2 1287 if (mtu)
618f9bc7
SK
1288 return mtu;
1289
c780a049 1290 mtu = READ_ONCE(dst->dev->mtu);
d33e4553 1291
8387fbac 1292 if (unlikely(ip_mtu_locked(dst))) {
155e8336 1293 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1294 mtu = 576;
1295 }
1296
14972cbd
RP
1297 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1298
1299 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
1300}
1301
c751af52
JA
1302static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1303{
1304 struct fnhe_hash_bucket *hash;
1305 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1306 u32 hval = fnhe_hashfun(daddr);
1307
1308 spin_lock_bh(&fnhe_lock);
1309
1310 hash = rcu_dereference_protected(nh->nh_exceptions,
1311 lockdep_is_held(&fnhe_lock));
1312 hash += hval;
1313
1314 fnhe_p = &hash->chain;
1315 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1316 while (fnhe) {
1317 if (fnhe->fnhe_daddr == daddr) {
1318 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1319 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
68588407
XL
1320 /* set fnhe_daddr to 0 to ensure it won't bind with
1321 * new dsts in rt_bind_exception().
1322 */
1323 fnhe->fnhe_daddr = 0;
c751af52
JA
1324 fnhe_flush_routes(fnhe);
1325 kfree_rcu(fnhe, rcu);
1326 break;
1327 }
1328 fnhe_p = &fnhe->fnhe_next;
1329 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1330 lockdep_is_held(&fnhe_lock));
1331 }
1332
1333 spin_unlock_bh(&fnhe_lock);
1334}
1335
f2bb4bed 1336static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1337{
caa41527 1338 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1339 struct fib_nh_exception *fnhe;
1340 u32 hval;
1341
f2bb4bed
DM
1342 if (!hash)
1343 return NULL;
1344
d3a25c98 1345 hval = fnhe_hashfun(daddr);
4895c771
DM
1346
1347 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1348 fnhe = rcu_dereference(fnhe->fnhe_next)) {
c751af52
JA
1349 if (fnhe->fnhe_daddr == daddr) {
1350 if (fnhe->fnhe_expires &&
1351 time_after(jiffies, fnhe->fnhe_expires)) {
1352 ip_del_fnhe(nh, daddr);
1353 break;
1354 }
f2bb4bed 1355 return fnhe;
c751af52 1356 }
f2bb4bed
DM
1357 }
1358 return NULL;
1359}
aee06da6 1360
caacf05e 1361static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
a4c2fd7f 1362 __be32 daddr, const bool do_cache)
f2bb4bed 1363{
caacf05e
DM
1364 bool ret = false;
1365
c5038a83 1366 spin_lock_bh(&fnhe_lock);
f2bb4bed 1367
c5038a83 1368 if (daddr == fnhe->fnhe_daddr) {
2ffae99d
TT
1369 struct rtable __rcu **porig;
1370 struct rtable *orig;
5aad1de5 1371 int genid = fnhe_genid(dev_net(rt->dst.dev));
2ffae99d
TT
1372
1373 if (rt_is_input_route(rt))
1374 porig = &fnhe->fnhe_rth_input;
1375 else
1376 porig = &fnhe->fnhe_rth_output;
1377 orig = rcu_dereference(*porig);
5aad1de5
TT
1378
1379 if (fnhe->fnhe_genid != genid) {
1380 fnhe->fnhe_genid = genid;
13d82bf5
SK
1381 fnhe->fnhe_gw = 0;
1382 fnhe->fnhe_pmtu = 0;
1383 fnhe->fnhe_expires = 0;
2ffae99d
TT
1384 fnhe_flush_routes(fnhe);
1385 orig = NULL;
13d82bf5 1386 }
387aa65a
TT
1387 fill_route_from_fnhe(rt, fnhe);
1388 if (!rt->rt_gateway)
155e8336 1389 rt->rt_gateway = daddr;
f2bb4bed 1390
a4c2fd7f 1391 if (do_cache) {
0830106c 1392 dst_hold(&rt->dst);
2ffae99d 1393 rcu_assign_pointer(*porig, rt);
0830106c 1394 if (orig) {
95c47f9c 1395 dst_dev_put(&orig->dst);
0830106c 1396 dst_release(&orig->dst);
0830106c 1397 }
2ffae99d
TT
1398 ret = true;
1399 }
c5038a83
DM
1400
1401 fnhe->fnhe_stamp = jiffies;
c5038a83
DM
1402 }
1403 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1404
1405 return ret;
54764bb6
ED
1406}
1407
caacf05e 1408static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1409{
d26b3a7c 1410 struct rtable *orig, *prev, **p;
caacf05e 1411 bool ret = true;
f2bb4bed 1412
d26b3a7c 1413 if (rt_is_input_route(rt)) {
54764bb6 1414 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1415 } else {
903ceff7 1416 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
d26b3a7c 1417 }
f2bb4bed
DM
1418 orig = *p;
1419
0830106c
WW
1420 /* hold dst before doing cmpxchg() to avoid race condition
1421 * on this dst
1422 */
1423 dst_hold(&rt->dst);
f2bb4bed
DM
1424 prev = cmpxchg(p, orig, rt);
1425 if (prev == orig) {
0830106c 1426 if (orig) {
95c47f9c 1427 dst_dev_put(&orig->dst);
0830106c 1428 dst_release(&orig->dst);
0830106c
WW
1429 }
1430 } else {
1431 dst_release(&rt->dst);
caacf05e 1432 ret = false;
0830106c 1433 }
caacf05e
DM
1434
1435 return ret;
1436}
1437
5055c371
ED
1438struct uncached_list {
1439 spinlock_t lock;
1440 struct list_head head;
1441};
1442
1443static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e
DM
1444
1445static void rt_add_uncached_list(struct rtable *rt)
1446{
5055c371
ED
1447 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1448
1449 rt->rt_uncached_list = ul;
1450
1451 spin_lock_bh(&ul->lock);
1452 list_add_tail(&rt->rt_uncached, &ul->head);
1453 spin_unlock_bh(&ul->lock);
caacf05e
DM
1454}
1455
1456static void ipv4_dst_destroy(struct dst_entry *dst)
1457{
3fb07daf 1458 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
caacf05e
DM
1459 struct rtable *rt = (struct rtable *) dst;
1460
9620fef2 1461 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
3fb07daf
ED
1462 kfree(p);
1463
78df76a0 1464 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1465 struct uncached_list *ul = rt->rt_uncached_list;
1466
1467 spin_lock_bh(&ul->lock);
caacf05e 1468 list_del(&rt->rt_uncached);
5055c371 1469 spin_unlock_bh(&ul->lock);
caacf05e
DM
1470 }
1471}
1472
1473void rt_flush_dev(struct net_device *dev)
1474{
5055c371
ED
1475 struct net *net = dev_net(dev);
1476 struct rtable *rt;
1477 int cpu;
1478
1479 for_each_possible_cpu(cpu) {
1480 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1481
5055c371
ED
1482 spin_lock_bh(&ul->lock);
1483 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1484 if (rt->dst.dev != dev)
1485 continue;
1486 rt->dst.dev = net->loopback_dev;
1487 dev_hold(rt->dst.dev);
1488 dev_put(dev);
1489 }
5055c371 1490 spin_unlock_bh(&ul->lock);
4895c771
DM
1491 }
1492}
1493
4331debc 1494static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1495{
4331debc
ED
1496 return rt &&
1497 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498 !rt_is_expired(rt);
d2d68ba9
DM
1499}
1500
f2bb4bed 1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1502 const struct fib_result *res,
f2bb4bed 1503 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1504 struct fib_info *fi, u16 type, u32 itag,
1505 const bool do_cache)
1da177e4 1506{
caacf05e
DM
1507 bool cached = false;
1508
1da177e4 1509 if (fi) {
4895c771
DM
1510 struct fib_nh *nh = &FIB_RES_NH(*res);
1511
155e8336 1512 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1513 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1514 rt->rt_uses_gateway = 1;
1515 }
3fb07daf
ED
1516 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517 if (fi->fib_metrics != &dst_default_metrics) {
1518 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
9620fef2 1519 refcount_inc(&fi->fib_metrics->refcnt);
3fb07daf 1520 }
c7066f70 1521#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1522 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1523#endif
61adedf3 1524 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
c5038a83 1525 if (unlikely(fnhe))
a4c2fd7f
WW
1526 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527 else if (do_cache)
caacf05e 1528 cached = rt_cache_route(nh, rt);
155e8336
JA
1529 if (unlikely(!cached)) {
1530 /* Routes we intend to cache in nexthop exception or
1531 * FIB nexthop have the DST_NOCACHE bit clear.
1532 * However, if we are unsuccessful at storing this
1533 * route into the cache we really need to set it.
1534 */
155e8336
JA
1535 if (!rt->rt_gateway)
1536 rt->rt_gateway = daddr;
1537 rt_add_uncached_list(rt);
1538 }
1539 } else
caacf05e 1540 rt_add_uncached_list(rt);
defb3519 1541
c7066f70 1542#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1543#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1544 set_class_tag(rt, res->tclassid);
1da177e4
LT
1545#endif
1546 set_class_tag(rt, itag);
1547#endif
1da177e4
LT
1548}
1549
9ab179d8
DA
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551 unsigned int flags, u16 type,
1552 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1553{
d08c4f35
DA
1554 struct rtable *rt;
1555
1556 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
a4c2fd7f 1557 (will_cache ? 0 : DST_HOST) |
d08c4f35 1558 (nopolicy ? DST_NOPOLICY : 0) |
b2a9c0ed 1559 (noxfrm ? DST_NOXFRM : 0));
d08c4f35
DA
1560
1561 if (rt) {
1562 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563 rt->rt_flags = flags;
1564 rt->rt_type = type;
1565 rt->rt_is_input = 0;
1566 rt->rt_iif = 0;
1567 rt->rt_pmtu = 0;
8387fbac 1568 rt->rt_mtu_locked = 0;
d08c4f35
DA
1569 rt->rt_gateway = 0;
1570 rt->rt_uses_gateway = 0;
b7503e0c 1571 rt->rt_table_id = 0;
d08c4f35
DA
1572 INIT_LIST_HEAD(&rt->rt_uncached);
1573
1574 rt->dst.output = ip_output;
1575 if (flags & RTCF_LOCAL)
1576 rt->dst.input = ip_local_deliver;
1577 }
1578
1579 return rt;
0c4dcd58 1580}
9ab179d8 1581EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1582
96d36220 1583/* called in rcu_read_lock() section */
bc044e8d
PA
1584int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585 u8 tos, struct net_device *dev,
1586 struct in_device *in_dev, u32 *itag)
1da177e4 1587{
b5f7e755 1588 int err;
1da177e4
LT
1589
1590 /* Primary sanity checks. */
51456b29 1591 if (!in_dev)
1da177e4
LT
1592 return -EINVAL;
1593
1e637c74 1594 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1595 skb->protocol != htons(ETH_P_IP))
bc044e8d 1596 return -EINVAL;
1da177e4 1597
75fea73d 1598 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
bc044e8d 1599 return -EINVAL;
d0daebc3 1600
f97c1e0c
JP
1601 if (ipv4_is_zeronet(saddr)) {
1602 if (!ipv4_is_local_multicast(daddr))
bc044e8d 1603 return -EINVAL;
b5f7e755 1604 } else {
9e56e380 1605 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
bc044e8d 1606 in_dev, itag);
b5f7e755 1607 if (err < 0)
bc044e8d 1608 return err;
b5f7e755 1609 }
bc044e8d
PA
1610 return 0;
1611}
1612
1613/* called in rcu_read_lock() section */
1614static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1615 u8 tos, struct net_device *dev, int our)
1616{
1617 struct in_device *in_dev = __in_dev_get_rcu(dev);
1618 unsigned int flags = RTCF_MULTICAST;
1619 struct rtable *rth;
1620 u32 itag = 0;
1621 int err;
1622
1623 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1624 if (err)
1625 return err;
1626
d08c4f35
DA
1627 if (our)
1628 flags |= RTCF_LOCAL;
1629
1630 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
f2bb4bed 1631 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4 1632 if (!rth)
bc044e8d 1633 return -ENOBUFS;
1da177e4 1634
cf911662
DM
1635#ifdef CONFIG_IP_ROUTE_CLASSID
1636 rth->dst.tclassid = itag;
1637#endif
d8d1f30b 1638 rth->dst.output = ip_rt_bug;
9917e1e8 1639 rth->rt_is_input= 1;
1da177e4
LT
1640
1641#ifdef CONFIG_IP_MROUTE
f97c1e0c 1642 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1643 rth->dst.input = ip_mr_input;
1da177e4
LT
1644#endif
1645 RT_CACHE_STAT_INC(in_slow_mc);
1646
89aef892
DM
1647 skb_dst_set(skb, &rth->dst);
1648 return 0;
1da177e4
LT
1649}
1650
1651
1652static void ip_handle_martian_source(struct net_device *dev,
1653 struct in_device *in_dev,
1654 struct sk_buff *skb,
9e12bb22
AV
1655 __be32 daddr,
1656 __be32 saddr)
1da177e4
LT
1657{
1658 RT_CACHE_STAT_INC(in_martian_src);
1659#ifdef CONFIG_IP_ROUTE_VERBOSE
1660 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1661 /*
1662 * RFC1812 recommendation, if source is martian,
1663 * the only hint is MAC header.
1664 */
058bd4d2 1665 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1666 &daddr, &saddr, dev->name);
98e399f8 1667 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1668 print_hex_dump(KERN_WARNING, "ll header: ",
1669 DUMP_PREFIX_OFFSET, 16, 1,
1670 skb_mac_header(skb),
1671 dev->hard_header_len, true);
1da177e4
LT
1672 }
1673 }
1674#endif
1675}
1676
efd85700
TG
1677static void set_lwt_redirect(struct rtable *rth)
1678{
1679 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1680 rth->dst.lwtstate->orig_output = rth->dst.output;
1681 rth->dst.output = lwtunnel_output;
1682 }
1683
1684 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1685 rth->dst.lwtstate->orig_input = rth->dst.input;
1686 rth->dst.input = lwtunnel_input;
1687 }
1688}
1689
47360228 1690/* called in rcu_read_lock() section */
5969f71d 1691static int __mkroute_input(struct sk_buff *skb,
982721f3 1692 const struct fib_result *res,
5969f71d 1693 struct in_device *in_dev,
c6cffba4 1694 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1695{
2ffae99d 1696 struct fib_nh_exception *fnhe;
1da177e4
LT
1697 struct rtable *rth;
1698 int err;
1699 struct in_device *out_dev;
d2d68ba9 1700 bool do_cache;
fbdc0ad0 1701 u32 itag = 0;
1da177e4
LT
1702
1703 /* get a working reference to the output device */
47360228 1704 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
51456b29 1705 if (!out_dev) {
e87cc472 1706 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1707 return -EINVAL;
1708 }
1709
5c04c819 1710 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1711 in_dev->dev, in_dev, &itag);
1da177e4 1712 if (err < 0) {
e905a9ed 1713 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1714 saddr);
e905a9ed 1715
1da177e4
LT
1716 goto cleanup;
1717 }
1718
e81da0e1
JA
1719 do_cache = res->fi && !itag;
1720 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
df4d9254 1721 skb->protocol == htons(ETH_P_IP) &&
1da177e4 1722 (IN_DEV_SHARED_MEDIA(out_dev) ||
df4d9254
HFS
1723 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1724 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1da177e4
LT
1725
1726 if (skb->protocol != htons(ETH_P_IP)) {
1727 /* Not IP (i.e. ARP). Do not create route, if it is
1728 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1729 *
1730 * Proxy arp feature have been extended to allow, ARP
1731 * replies back to the same interface, to support
1732 * Private VLAN switch technologies. See arp.c.
1da177e4 1733 */
65324144
JDB
1734 if (out_dev == in_dev &&
1735 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1736 err = -EINVAL;
1737 goto cleanup;
1738 }
1739 }
1740
2ffae99d 1741 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1742 if (do_cache) {
c751af52 1743 if (fnhe)
2ffae99d 1744 rth = rcu_dereference(fnhe->fnhe_rth_input);
c751af52
JA
1745 else
1746 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
e81da0e1
JA
1747 if (rt_cache_valid(rth)) {
1748 skb_dst_set_noref(skb, &rth->dst);
1749 goto out;
d2d68ba9
DM
1750 }
1751 }
f2bb4bed 1752
d08c4f35 1753 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1754 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1755 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1756 if (!rth) {
1757 err = -ENOBUFS;
1758 goto cleanup;
1759 }
1760
9917e1e8 1761 rth->rt_is_input = 1;
b7503e0c
DA
1762 if (res->table)
1763 rth->rt_table_id = res->table->tb_id;
a6254864 1764 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1765
d8d1f30b 1766 rth->dst.input = ip_forward;
1da177e4 1767
a4c2fd7f
WW
1768 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1769 do_cache);
efd85700 1770 set_lwt_redirect(rth);
c6cffba4 1771 skb_dst_set(skb, &rth->dst);
d2d68ba9 1772out:
1da177e4
LT
1773 err = 0;
1774 cleanup:
1da177e4 1775 return err;
e905a9ed 1776}
1da177e4 1777
79a13159 1778#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1779/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1780 * calculated from the inner IP addresses.
79a13159 1781 */
bf4e0a3d
NA
1782static void ip_multipath_l3_keys(const struct sk_buff *skb,
1783 struct flow_keys *hash_keys)
79a13159
PN
1784{
1785 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1786 const struct iphdr *inner_iph;
79a13159
PN
1787 const struct icmphdr *icmph;
1788 struct iphdr _inner_iph;
bf4e0a3d
NA
1789 struct icmphdr _icmph;
1790
1791 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1792 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1793 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1794 return;
79a13159
PN
1795
1796 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1797 return;
79a13159
PN
1798
1799 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1800 &_icmph);
1801 if (!icmph)
bf4e0a3d 1802 return;
79a13159
PN
1803
1804 if (icmph->type != ICMP_DEST_UNREACH &&
1805 icmph->type != ICMP_REDIRECT &&
1806 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1807 icmph->type != ICMP_PARAMETERPROB)
1808 return;
79a13159
PN
1809
1810 inner_iph = skb_header_pointer(skb,
1811 outer_iph->ihl * 4 + sizeof(_icmph),
1812 sizeof(_inner_iph), &_inner_iph);
1813 if (!inner_iph)
bf4e0a3d
NA
1814 return;
1815 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1816 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1817}
79a13159 1818
bf4e0a3d
NA
1819/* if skb is set it will be used and fl4 can be NULL */
1820int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1821 const struct sk_buff *skb)
1822{
1823 struct net *net = fi->fib_net;
1824 struct flow_keys hash_keys;
1825 u32 mhash;
79a13159 1826
bf4e0a3d
NA
1827 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1828 case 0:
1829 memset(&hash_keys, 0, sizeof(hash_keys));
1830 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1831 if (skb) {
1832 ip_multipath_l3_keys(skb, &hash_keys);
1833 } else {
1834 hash_keys.addrs.v4addrs.src = fl4->saddr;
1835 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1836 }
1837 break;
1838 case 1:
1839 /* skb is currently provided only when forwarding */
1840 if (skb) {
1841 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1842 struct flow_keys keys;
1843
1844 /* short-circuit if we already have L4 hash present */
1845 if (skb->l4_hash)
1846 return skb_get_hash_raw(skb) >> 1;
1847 memset(&hash_keys, 0, sizeof(hash_keys));
1848 skb_flow_dissect_flow_keys(skb, &keys, flag);
97ba6e5f
DA
1849
1850 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
bf4e0a3d
NA
1851 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1852 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1853 hash_keys.ports.src = keys.ports.src;
1854 hash_keys.ports.dst = keys.ports.dst;
1855 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1856 } else {
1857 memset(&hash_keys, 0, sizeof(hash_keys));
1858 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859 hash_keys.addrs.v4addrs.src = fl4->saddr;
1860 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1861 hash_keys.ports.src = fl4->fl4_sport;
1862 hash_keys.ports.dst = fl4->fl4_dport;
1863 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1864 }
1865 break;
1866 }
1867 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1868
bf4e0a3d
NA
1869 return mhash >> 1;
1870}
1871EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1872#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1873
5969f71d
SH
1874static int ip_mkroute_input(struct sk_buff *skb,
1875 struct fib_result *res,
5969f71d
SH
1876 struct in_device *in_dev,
1877 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1878{
1da177e4 1879#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1880 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1881 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1882
0e884c78
PN
1883 fib_select_multipath(res, h);
1884 }
1da177e4
LT
1885#endif
1886
1887 /* create a routing cache entry */
c6cffba4 1888 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1889}
1890
1da177e4
LT
1891/*
1892 * NOTE. We drop all the packets that has local source
1893 * addresses, because every properly looped back packet
1894 * must have correct destination already attached by output routine.
1895 *
1896 * Such approach solves two big problems:
1897 * 1. Not simplex devices are handled properly.
1898 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1899 * called with rcu_read_lock()
1da177e4
LT
1900 */
1901
9e12bb22 1902static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1903 u8 tos, struct net_device *dev,
1904 struct fib_result *res)
1da177e4 1905{
96d36220 1906 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1907 struct ip_tunnel_info *tun_info;
68a5e3dd 1908 struct flowi4 fl4;
95c96174 1909 unsigned int flags = 0;
1da177e4 1910 u32 itag = 0;
95c96174 1911 struct rtable *rth;
1da177e4 1912 int err = -EINVAL;
5e73ea1a 1913 struct net *net = dev_net(dev);
d2d68ba9 1914 bool do_cache;
1da177e4
LT
1915
1916 /* IP on this device is disabled. */
1917
1918 if (!in_dev)
1919 goto out;
1920
1921 /* Check for the most weird martians, which can be not detected
1922 by fib_lookup.
1923 */
1924
61adedf3 1925 tun_info = skb_tunnel_info(skb);
46fa062a 1926 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1927 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1928 else
1929 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1930 skb_dst_drop(skb);
1931
d0daebc3 1932 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1933 goto martian_source;
1934
5510cdf7
DA
1935 res->fi = NULL;
1936 res->table = NULL;
27a954bd 1937 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1938 goto brd_input;
1939
1940 /* Accept zero addresses only to limited broadcast;
1941 * I even do not know to fix it or not. Waiting for complains :-)
1942 */
f97c1e0c 1943 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1944 goto martian_source;
1945
d0daebc3 1946 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1947 goto martian_destination;
1948
9eb43e76
ED
1949 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1950 * and call it once if daddr or/and saddr are loopback addresses
1951 */
1952 if (ipv4_is_loopback(daddr)) {
1953 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1954 goto martian_destination;
9eb43e76
ED
1955 } else if (ipv4_is_loopback(saddr)) {
1956 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1957 goto martian_source;
1958 }
1959
1da177e4
LT
1960 /*
1961 * Now we are ready to route packet.
1962 */
68a5e3dd 1963 fl4.flowi4_oif = 0;
e0d56fdd 1964 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1965 fl4.flowi4_mark = skb->mark;
1966 fl4.flowi4_tos = tos;
1967 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1968 fl4.flowi4_flags = 0;
68a5e3dd
DM
1969 fl4.daddr = daddr;
1970 fl4.saddr = saddr;
8bcfd092 1971 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1972 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1973 if (err != 0) {
1974 if (!IN_DEV_FORWARD(in_dev))
1975 err = -EHOSTUNREACH;
1da177e4 1976 goto no_route;
cd0f0b95 1977 }
1da177e4 1978
5510cdf7 1979 if (res->type == RTN_BROADCAST)
1da177e4
LT
1980 goto brd_input;
1981
5510cdf7 1982 if (res->type == RTN_LOCAL) {
5c04c819 1983 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1984 0, dev, in_dev, &itag);
b5f7e755 1985 if (err < 0)
0d753960 1986 goto martian_source;
1da177e4
LT
1987 goto local_input;
1988 }
1989
cd0f0b95
DJ
1990 if (!IN_DEV_FORWARD(in_dev)) {
1991 err = -EHOSTUNREACH;
251da413 1992 goto no_route;
cd0f0b95 1993 }
5510cdf7 1994 if (res->type != RTN_UNICAST)
1da177e4
LT
1995 goto martian_destination;
1996
5510cdf7 1997 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1998out: return err;
1999
2000brd_input:
2001 if (skb->protocol != htons(ETH_P_IP))
2002 goto e_inval;
2003
41347dcd 2004 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2005 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006 in_dev, &itag);
1da177e4 2007 if (err < 0)
0d753960 2008 goto martian_source;
1da177e4
LT
2009 }
2010 flags |= RTCF_BROADCAST;
5510cdf7 2011 res->type = RTN_BROADCAST;
1da177e4
LT
2012 RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
d2d68ba9 2015 do_cache = false;
5510cdf7 2016 if (res->fi) {
fe3edf45 2017 if (!itag) {
5510cdf7 2018 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2019 if (rt_cache_valid(rth)) {
c6cffba4
DM
2020 skb_dst_set_noref(skb, &rth->dst);
2021 err = 0;
2022 goto out;
d2d68ba9
DM
2023 }
2024 do_cache = true;
2025 }
2026 }
2027
f5a0aab8 2028 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2029 flags | RTCF_LOCAL, res->type,
d2d68ba9 2030 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2031 if (!rth)
2032 goto e_nobufs;
2033
d8d1f30b 2034 rth->dst.output= ip_rt_bug;
cf911662
DM
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036 rth->dst.tclassid = itag;
2037#endif
9917e1e8 2038 rth->rt_is_input = 1;
5510cdf7
DA
2039 if (res->table)
2040 rth->rt_table_id = res->table->tb_id;
571e7226 2041
a6254864 2042 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2043 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2044 rth->dst.input= ip_error;
2045 rth->dst.error= -err;
1da177e4
LT
2046 rth->rt_flags &= ~RTCF_LOCAL;
2047 }
efd85700 2048
dcdfdf56 2049 if (do_cache) {
5510cdf7 2050 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2051
2052 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2053 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2054 WARN_ON(rth->dst.input == lwtunnel_input);
2055 rth->dst.lwtstate->orig_input = rth->dst.input;
2056 rth->dst.input = lwtunnel_input;
2057 }
2058
a4c2fd7f 2059 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2060 rt_add_uncached_list(rth);
dcdfdf56 2061 }
89aef892 2062 skb_dst_set(skb, &rth->dst);
b23dd4fe 2063 err = 0;
ebc0ffae 2064 goto out;
1da177e4
LT
2065
2066no_route:
2067 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2068 res->type = RTN_UNREACHABLE;
2069 res->fi = NULL;
2070 res->table = NULL;
1da177e4
LT
2071 goto local_input;
2072
2073 /*
2074 * Do not cache martian addresses: they should be logged (RFC1812)
2075 */
2076martian_destination:
2077 RT_CACHE_STAT_INC(in_martian_dst);
2078#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2079 if (IN_DEV_LOG_MARTIANS(in_dev))
2080 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2081 &daddr, &saddr, dev->name);
1da177e4 2082#endif
2c2910a4 2083
1da177e4
LT
2084e_inval:
2085 err = -EINVAL;
ebc0ffae 2086 goto out;
1da177e4
LT
2087
2088e_nobufs:
2089 err = -ENOBUFS;
ebc0ffae 2090 goto out;
1da177e4
LT
2091
2092martian_source:
2093 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2094 goto out;
1da177e4
LT
2095}
2096
c6cffba4
DM
2097int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev)
1da177e4 2099{
5510cdf7
DA
2100 struct fib_result res;
2101 int err;
1da177e4 2102
6e28099d 2103 tos &= IPTOS_RT_MASK;
96d36220 2104 rcu_read_lock();
5510cdf7
DA
2105 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2106 rcu_read_unlock();
96d36220 2107
5510cdf7
DA
2108 return err;
2109}
2110EXPORT_SYMBOL(ip_route_input_noref);
2111
2112/* called with rcu_read_lock held */
2113int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2114 u8 tos, struct net_device *dev, struct fib_result *res)
2115{
1da177e4
LT
2116 /* Multicast recognition logic is moved from route cache to here.
2117 The problem was that too many Ethernet cards have broken/missing
2118 hardware multicast filters :-( As result the host on multicasting
2119 network acquires a lot of useless route cache entries, sort of
2120 SDR messages from all the world. Now we try to get rid of them.
2121 Really, provided software IP multicast filter is organized
2122 reasonably (at least, hashed), it does not result in a slowdown
2123 comparing with route cache reject entries.
2124 Note, that multicast routers are not affected, because
2125 route cache entry is created eventually.
2126 */
f97c1e0c 2127 if (ipv4_is_multicast(daddr)) {
96d36220 2128 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2129 int our = 0;
5510cdf7 2130 int err = -EINVAL;
1da177e4 2131
c8c6b846
PA
2132 if (!in_dev)
2133 return err;
2134 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2135 ip_hdr(skb)->protocol);
e58e4159
DA
2136
2137 /* check l3 master if no match yet */
c8c6b846 2138 if (!our && netif_is_l3_slave(dev)) {
e58e4159
DA
2139 struct in_device *l3_in_dev;
2140
2141 l3_in_dev = __in_dev_get_rcu(skb->dev);
2142 if (l3_in_dev)
2143 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2144 ip_hdr(skb)->protocol);
2145 }
2146
e58e4159 2147 if (our
1da177e4 2148#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2149 ||
2150 (!ipv4_is_local_multicast(daddr) &&
2151 IN_DEV_MFORWARD(in_dev))
1da177e4 2152#endif
e58e4159 2153 ) {
5510cdf7 2154 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2155 tos, dev, our);
1da177e4 2156 }
5510cdf7 2157 return err;
1da177e4 2158 }
5510cdf7
DA
2159
2160 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2161}
2162
ebc0ffae 2163/* called with rcu_read_lock() */
982721f3 2164static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2165 const struct flowi4 *fl4, int orig_oif,
f61759e6 2166 struct net_device *dev_out,
5ada5527 2167 unsigned int flags)
1da177e4 2168{
982721f3 2169 struct fib_info *fi = res->fi;
f2bb4bed 2170 struct fib_nh_exception *fnhe;
5ada5527 2171 struct in_device *in_dev;
982721f3 2172 u16 type = res->type;
5ada5527 2173 struct rtable *rth;
c92b9655 2174 bool do_cache;
1da177e4 2175
d0daebc3
TG
2176 in_dev = __in_dev_get_rcu(dev_out);
2177 if (!in_dev)
5ada5527 2178 return ERR_PTR(-EINVAL);
1da177e4 2179
d0daebc3 2180 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2181 if (ipv4_is_loopback(fl4->saddr) &&
2182 !(dev_out->flags & IFF_LOOPBACK) &&
2183 !netif_is_l3_master(dev_out))
d0daebc3
TG
2184 return ERR_PTR(-EINVAL);
2185
68a5e3dd 2186 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2187 type = RTN_BROADCAST;
68a5e3dd 2188 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2189 type = RTN_MULTICAST;
68a5e3dd 2190 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2191 return ERR_PTR(-EINVAL);
1da177e4
LT
2192
2193 if (dev_out->flags & IFF_LOOPBACK)
2194 flags |= RTCF_LOCAL;
2195
63617421 2196 do_cache = true;
982721f3 2197 if (type == RTN_BROADCAST) {
1da177e4 2198 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2199 fi = NULL;
2200 } else if (type == RTN_MULTICAST) {
dd28d1a0 2201 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2202 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2203 fl4->flowi4_proto))
1da177e4 2204 flags &= ~RTCF_LOCAL;
63617421
JA
2205 else
2206 do_cache = false;
1da177e4 2207 /* If multicast route do not exist use
dd28d1a0
ED
2208 * default one, but do not gateway in this case.
2209 * Yes, it is hack.
1da177e4 2210 */
982721f3
DM
2211 if (fi && res->prefixlen < 4)
2212 fi = NULL;
d6d5e999
CF
2213 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2214 (orig_oif != dev_out->ifindex)) {
2215 /* For local routes that require a particular output interface
2216 * we do not want to cache the result. Caching the result
2217 * causes incorrect behaviour when there are multiple source
2218 * addresses on the interface, the end result being that if the
2219 * intended recipient is waiting on that interface for the
2220 * packet he won't receive it because it will be delivered on
2221 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2222 * be set to the loopback interface as well.
2223 */
c751af52 2224 do_cache = false;
1da177e4
LT
2225 }
2226
f2bb4bed 2227 fnhe = NULL;
63617421 2228 do_cache &= fi != NULL;
c751af52 2229 if (fi) {
c5038a83 2230 struct rtable __rcu **prth;
c92b9655 2231 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2232
c92b9655 2233 fnhe = find_exception(nh, fl4->daddr);
c751af52
JA
2234 if (!do_cache)
2235 goto add;
deed49df 2236 if (fnhe) {
2ffae99d 2237 prth = &fnhe->fnhe_rth_output;
c751af52
JA
2238 } else {
2239 if (unlikely(fl4->flowi4_flags &
2240 FLOWI_FLAG_KNOWN_NH &&
2241 !(nh->nh_gw &&
2242 nh->nh_scope == RT_SCOPE_LINK))) {
2243 do_cache = false;
2244 goto add;
c92b9655 2245 }
c751af52 2246 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c92b9655 2247 }
c5038a83 2248 rth = rcu_dereference(*prth);
9df16efa 2249 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2250 return rth;
f2bb4bed 2251 }
c92b9655
JA
2252
2253add:
d08c4f35 2254 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2255 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2256 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2257 do_cache);
8391d07b 2258 if (!rth)
5ada5527 2259 return ERR_PTR(-ENOBUFS);
8391d07b 2260
9438c871 2261 rth->rt_iif = orig_oif;
b7503e0c
DA
2262 if (res->table)
2263 rth->rt_table_id = res->table->tb_id;
2264
1da177e4
LT
2265 RT_CACHE_STAT_INC(out_slow_tot);
2266
1da177e4 2267 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2268 if (flags & RTCF_LOCAL &&
1da177e4 2269 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2270 rth->dst.output = ip_mc_output;
1da177e4
LT
2271 RT_CACHE_STAT_INC(out_slow_mc);
2272 }
2273#ifdef CONFIG_IP_MROUTE
982721f3 2274 if (type == RTN_MULTICAST) {
1da177e4 2275 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2276 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2277 rth->dst.input = ip_mr_input;
2278 rth->dst.output = ip_mc_output;
1da177e4
LT
2279 }
2280 }
2281#endif
2282 }
2283
a4c2fd7f 2284 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
efd85700 2285 set_lwt_redirect(rth);
1da177e4 2286
5ada5527 2287 return rth;
1da177e4
LT
2288}
2289
1da177e4
LT
2290/*
2291 * Major route resolver routine.
2292 */
2293
3abd1ade
DA
2294struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2295 const struct sk_buff *skb)
1da177e4 2296{
f61759e6 2297 __u8 tos = RT_FL_TOS(fl4);
154ff3e0
ED
2298 struct fib_result res = {
2299 .type = RTN_UNSPEC,
2300 .fi = NULL,
2301 .table = NULL,
2302 .tclassid = 0,
2303 };
5ada5527 2304 struct rtable *rth;
1da177e4 2305
1fb9489b 2306 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2307 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2308 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2309 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2310
010c2708 2311 rcu_read_lock();
3abd1ade
DA
2312 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2313 rcu_read_unlock();
2314
2315 return rth;
2316}
2317EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2318
2319struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2320 struct fib_result *res,
2321 const struct sk_buff *skb)
2322{
2323 struct net_device *dev_out = NULL;
2324 int orig_oif = fl4->flowi4_oif;
2325 unsigned int flags = 0;
2326 struct rtable *rth;
2327 int err = -ENETUNREACH;
2328
813b3b5d 2329 if (fl4->saddr) {
b23dd4fe 2330 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2331 if (ipv4_is_multicast(fl4->saddr) ||
2332 ipv4_is_lbcast(fl4->saddr) ||
2333 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2334 goto out;
2335
1da177e4
LT
2336 /* I removed check for oif == dev_out->oif here.
2337 It was wrong for two reasons:
1ab35276
DL
2338 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2339 is assigned to multiple interfaces.
1da177e4
LT
2340 2. Moreover, we are allowed to send packets with saddr
2341 of another iface. --ANK
2342 */
2343
813b3b5d
DM
2344 if (fl4->flowi4_oif == 0 &&
2345 (ipv4_is_multicast(fl4->daddr) ||
2346 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2347 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2348 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2349 if (!dev_out)
a210d01a
JA
2350 goto out;
2351
1da177e4
LT
2352 /* Special hack: user can direct multicasts
2353 and limited broadcast via necessary interface
2354 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2355 This hack is not just for fun, it allows
2356 vic,vat and friends to work.
2357 They bind socket to loopback, set ttl to zero
2358 and expect that it will work.
2359 From the viewpoint of routing cache they are broken,
2360 because we are not allowed to build multicast path
2361 with loopback source addr (look, routing cache
2362 cannot know, that ttl is zero, so that packet
2363 will not leave this host and route is valid).
2364 Luckily, this hack is good workaround.
2365 */
2366
813b3b5d 2367 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2368 goto make_route;
2369 }
a210d01a 2370
813b3b5d 2371 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2372 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2373 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2374 goto out;
a210d01a 2375 }
1da177e4
LT
2376 }
2377
2378
813b3b5d
DM
2379 if (fl4->flowi4_oif) {
2380 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2381 rth = ERR_PTR(-ENODEV);
51456b29 2382 if (!dev_out)
1da177e4 2383 goto out;
e5ed6399
HX
2384
2385 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2386 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2387 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2388 goto out;
2389 }
813b3b5d 2390 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2391 ipv4_is_lbcast(fl4->daddr) ||
2392 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2393 if (!fl4->saddr)
2394 fl4->saddr = inet_select_addr(dev_out, 0,
2395 RT_SCOPE_LINK);
1da177e4
LT
2396 goto make_route;
2397 }
0a7e2260 2398 if (!fl4->saddr) {
813b3b5d
DM
2399 if (ipv4_is_multicast(fl4->daddr))
2400 fl4->saddr = inet_select_addr(dev_out, 0,
2401 fl4->flowi4_scope);
2402 else if (!fl4->daddr)
2403 fl4->saddr = inet_select_addr(dev_out, 0,
2404 RT_SCOPE_HOST);
1da177e4
LT
2405 }
2406 }
2407
813b3b5d
DM
2408 if (!fl4->daddr) {
2409 fl4->daddr = fl4->saddr;
2410 if (!fl4->daddr)
2411 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2412 dev_out = net->loopback_dev;
1fb9489b 2413 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2414 res->type = RTN_LOCAL;
1da177e4
LT
2415 flags |= RTCF_LOCAL;
2416 goto make_route;
2417 }
2418
3abd1ade 2419 err = fib_lookup(net, fl4, res, 0);
0315e382 2420 if (err) {
3abd1ade
DA
2421 res->fi = NULL;
2422 res->table = NULL;
6104e112 2423 if (fl4->flowi4_oif &&
e58e4159
DA
2424 (ipv4_is_multicast(fl4->daddr) ||
2425 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2426 /* Apparently, routing tables are wrong. Assume,
2427 that the destination is on link.
2428
2429 WHY? DW.
2430 Because we are allowed to send to iface
2431 even if it has NO routes and NO assigned
2432 addresses. When oif is specified, routing
2433 tables are looked up with only one purpose:
2434 to catch if destination is gatewayed, rather than
2435 direct. Moreover, if MSG_DONTROUTE is set,
2436 we send packet, ignoring both routing tables
2437 and ifaddr state. --ANK
2438
2439
2440 We could make it even if oif is unknown,
2441 likely IPv6, but we do not.
2442 */
2443
813b3b5d
DM
2444 if (fl4->saddr == 0)
2445 fl4->saddr = inet_select_addr(dev_out, 0,
2446 RT_SCOPE_LINK);
3abd1ade 2447 res->type = RTN_UNICAST;
1da177e4
LT
2448 goto make_route;
2449 }
0315e382 2450 rth = ERR_PTR(err);
1da177e4
LT
2451 goto out;
2452 }
1da177e4 2453
3abd1ade 2454 if (res->type == RTN_LOCAL) {
813b3b5d 2455 if (!fl4->saddr) {
3abd1ade
DA
2456 if (res->fi->fib_prefsrc)
2457 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2458 else
813b3b5d 2459 fl4->saddr = fl4->daddr;
9fc3bbb4 2460 }
5f02ce24
DA
2461
2462 /* L3 master device is the loopback for that domain */
3abd1ade 2463 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2464 net->loopback_dev;
839da4d9
DA
2465
2466 /* make sure orig_oif points to fib result device even
2467 * though packet rx/tx happens over loopback or l3mdev
2468 */
2469 orig_oif = FIB_RES_OIF(*res);
2470
813b3b5d 2471 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2472 flags |= RTCF_LOCAL;
2473 goto make_route;
2474 }
2475
3abd1ade 2476 fib_select_path(net, res, fl4, skb);
1da177e4 2477
3abd1ade 2478 dev_out = FIB_RES_DEV(*res);
813b3b5d 2479 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2480
2481
2482make_route:
3abd1ade 2483 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2484
010c2708 2485out:
b23dd4fe 2486 return rth;
1da177e4 2487}
d8c97a94 2488
ae2688d5
JW
2489static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2490{
2491 return NULL;
2492}
2493
ebb762f2 2494static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2495{
618f9bc7
SK
2496 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2497
2498 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2499}
2500
6700c270
DM
2501static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2502 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2503{
2504}
2505
6700c270
DM
/* dst_ops.redirect for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2510
0972ddb2
HB
2511static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2512 unsigned long old)
2513{
2514 return NULL;
2515}
2516
14e50e57
DM
2517static struct dst_ops ipv4_dst_blackhole_ops = {
2518 .family = AF_INET,
ae2688d5 2519 .check = ipv4_blackhole_dst_check,
ebb762f2 2520 .mtu = ipv4_blackhole_mtu,
214f45c9 2521 .default_advmss = ipv4_default_advmss,
14e50e57 2522 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2523 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2524 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2525 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2526};
2527
2774c131 2528struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2529{
2774c131 2530 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2531 struct rtable *rt;
14e50e57 2532
6c0e7284 2533 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2534 if (rt) {
d8d1f30b 2535 struct dst_entry *new = &rt->dst;
14e50e57 2536
14e50e57 2537 new->__use = 1;
352e512c 2538 new->input = dst_discard;
ede2059d 2539 new->output = dst_discard_out;
14e50e57 2540
1dbe3252 2541 new->dev = net->loopback_dev;
14e50e57
DM
2542 if (new->dev)
2543 dev_hold(new->dev);
2544
9917e1e8 2545 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2546 rt->rt_iif = ort->rt_iif;
5943634f 2547 rt->rt_pmtu = ort->rt_pmtu;
8387fbac 2548 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2549
ca4c3fc2 2550 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2551 rt->rt_flags = ort->rt_flags;
2552 rt->rt_type = ort->rt_type;
14e50e57 2553 rt->rt_gateway = ort->rt_gateway;
155e8336 2554 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2555
caacf05e 2556 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2557 }
2558
2774c131
DM
2559 dst_release(dst_orig);
2560
2561 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2562}
2563
9d6ec938 2564struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2565 const struct sock *sk)
1da177e4 2566{
9d6ec938 2567 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2568
b23dd4fe
DM
2569 if (IS_ERR(rt))
2570 return rt;
1da177e4 2571
56157872 2572 if (flp4->flowi4_proto)
f92ee619
SK
2573 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2574 flowi4_to_flowi(flp4),
2575 sk, 0);
1da177e4 2576
b23dd4fe 2577 return rt;
1da177e4 2578}
d8c97a94
ACM
2579EXPORT_SYMBOL_GPL(ip_route_output_flow);
2580
3765d35e 2581/* called with rcu_read_lock held */
c36ba660 2582static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2583 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2584 u32 seq)
1da177e4 2585{
ba52d61e 2586 struct rtable *rt = skb_rtable(skb);
1da177e4 2587 struct rtmsg *r;
be403ea1 2588 struct nlmsghdr *nlh;
2bc8ca40 2589 unsigned long expires = 0;
f185071d 2590 u32 error;
521f5490 2591 u32 metrics[RTAX_MAX];
be403ea1 2592
d3166e0c 2593 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2594 if (!nlh)
26932566 2595 return -EMSGSIZE;
be403ea1
TG
2596
2597 r = nlmsg_data(nlh);
1da177e4
LT
2598 r->rtm_family = AF_INET;
2599 r->rtm_dst_len = 32;
2600 r->rtm_src_len = 0;
d6c0a4f6 2601 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2602 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2603 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2604 goto nla_put_failure;
1da177e4
LT
2605 r->rtm_type = rt->rt_type;
2606 r->rtm_scope = RT_SCOPE_UNIVERSE;
2607 r->rtm_protocol = RTPROT_UNSPEC;
2608 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2609 if (rt->rt_flags & RTCF_NOTIFY)
2610 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2611 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2612 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2613
930345ea 2614 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2615 goto nla_put_failure;
1a00fee4 2616 if (src) {
1da177e4 2617 r->rtm_src_len = 32;
930345ea 2618 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2619 goto nla_put_failure;
1da177e4 2620 }
f3756b79
DM
2621 if (rt->dst.dev &&
2622 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2623 goto nla_put_failure;
c7066f70 2624#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2625 if (rt->dst.tclassid &&
2626 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2627 goto nla_put_failure;
1da177e4 2628#endif
41347dcd 2629 if (!rt_is_input_route(rt) &&
d6c0a4f6 2630 fl4->saddr != src) {
930345ea 2631 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2632 goto nla_put_failure;
2633 }
155e8336 2634 if (rt->rt_uses_gateway &&
930345ea 2635 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2636 goto nla_put_failure;
be403ea1 2637
ee9a8f7a
SK
2638 expires = rt->dst.expires;
2639 if (expires) {
2640 unsigned long now = jiffies;
2641
2642 if (time_before(now, expires))
2643 expires -= now;
2644 else
2645 expires = 0;
2646 }
2647
521f5490 2648 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2649 if (rt->rt_pmtu && expires)
521f5490 2650 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
8387fbac
SD
2651 if (rt->rt_mtu_locked && expires)
2652 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2653 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2654 goto nla_put_failure;
2655
b4869889 2656 if (fl4->flowi4_mark &&
68aaed54 2657 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2658 goto nla_put_failure;
963bfeee 2659
622ec2c9
LC
2660 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2661 nla_put_u32(skb, RTA_UID,
2662 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2663 goto nla_put_failure;
2664
d8d1f30b 2665 error = rt->dst.error;
be403ea1 2666
c7537967 2667 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2668#ifdef CONFIG_IP_MROUTE
2669 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2670 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2671 int err = ipmr_get_route(net, skb,
2672 fl4->saddr, fl4->daddr,
9f09eaea 2673 r, portid);
2cf75070 2674
8caaf7b6 2675 if (err <= 0) {
0c8d803f
DA
2676 if (err == 0)
2677 return 0;
2678 goto nla_put_failure;
8caaf7b6
ND
2679 }
2680 } else
2681#endif
91146153 2682 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2683 goto nla_put_failure;
1da177e4
LT
2684 }
2685
f185071d 2686 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2687 goto nla_put_failure;
be403ea1 2688
053c095a
JB
2689 nlmsg_end(skb, nlh);
2690 return 0;
1da177e4 2691
be403ea1 2692nla_put_failure:
26932566
PM
2693 nlmsg_cancel(skb, nlh);
2694 return -EMSGSIZE;
1da177e4
LT
2695}
2696
c21ef3e3
DA
2697static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2698 struct netlink_ext_ack *extack)
1da177e4 2699{
3b1e0a65 2700 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2701 struct rtmsg *rtm;
2702 struct nlattr *tb[RTA_MAX+1];
3765d35e 2703 struct fib_result res = {};
1da177e4 2704 struct rtable *rt = NULL;
d6c0a4f6 2705 struct flowi4 fl4;
9e12bb22
AV
2706 __be32 dst = 0;
2707 __be32 src = 0;
2708 u32 iif;
d889ce3b 2709 int err;
963bfeee 2710 int mark;
1da177e4 2711 struct sk_buff *skb;
c36ba660 2712 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2713 kuid_t uid;
1da177e4 2714
fceb6435 2715 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2716 extack);
d889ce3b
TG
2717 if (err < 0)
2718 goto errout;
2719
2720 rtm = nlmsg_data(nlh);
2721
1da177e4 2722 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2723 if (!skb) {
d889ce3b
TG
2724 err = -ENOBUFS;
2725 goto errout;
2726 }
1da177e4
LT
2727
2728 /* Reserve room for dummy headers, this skb can pass
2729 through good chunk of routing engine.
2730 */
459a98ed 2731 skb_reset_mac_header(skb);
c1d2bbe1 2732 skb_reset_network_header(skb);
d2c962b8 2733
67b61f6c
JB
2734 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2735 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2736 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2737 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2738 if (tb[RTA_UID])
2739 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2740 else
2741 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2742
bbadb9a2
FL
2743 /* Bugfix: need to give ip_route_input enough of an IP header to
2744 * not gag.
2745 */
2746 ip_hdr(skb)->protocol = IPPROTO_UDP;
2747 ip_hdr(skb)->saddr = src;
2748 ip_hdr(skb)->daddr = dst;
2749
2750 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2751
d6c0a4f6
DM
2752 memset(&fl4, 0, sizeof(fl4));
2753 fl4.daddr = dst;
2754 fl4.saddr = src;
2755 fl4.flowi4_tos = rtm->rtm_tos;
2756 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2757 fl4.flowi4_mark = mark;
622ec2c9 2758 fl4.flowi4_uid = uid;
d6c0a4f6 2759
3765d35e
DA
2760 rcu_read_lock();
2761
1da177e4 2762 if (iif) {
d889ce3b
TG
2763 struct net_device *dev;
2764
3765d35e 2765 dev = dev_get_by_index_rcu(net, iif);
51456b29 2766 if (!dev) {
d889ce3b
TG
2767 err = -ENODEV;
2768 goto errout_free;
2769 }
2770
1da177e4
LT
2771 skb->protocol = htons(ETH_P_IP);
2772 skb->dev = dev;
963bfeee 2773 skb->mark = mark;
3765d35e
DA
2774 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2775 dev, &res);
d889ce3b 2776
511c3f92 2777 rt = skb_rtable(skb);
d8d1f30b
CG
2778 if (err == 0 && rt->dst.error)
2779 err = -rt->dst.error;
1da177e4 2780 } else {
c2fd0b21 2781 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3765d35e 2782 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2783 err = 0;
2784 if (IS_ERR(rt))
2785 err = PTR_ERR(rt);
2c87d63a
FW
2786 else
2787 skb_dst_set(skb, &rt->dst);
1da177e4 2788 }
d889ce3b 2789
1da177e4 2790 if (err)
d889ce3b 2791 goto errout_free;
1da177e4 2792
1da177e4
LT
2793 if (rtm->rtm_flags & RTM_F_NOTIFY)
2794 rt->rt_flags |= RTCF_NOTIFY;
2795
c36ba660
DA
2796 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2797 table_id = rt->rt_table_id;
2798
bc3aae2b
RP
2799 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2800 if (!res.fi) {
2801 err = fib_props[res.type].error;
2802 if (!err)
2803 err = -EHOSTUNREACH;
2804 goto errout_free;
2805 }
b6179813
RP
2806 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2807 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2808 rt->rt_type, res.prefix, res.prefixlen,
2809 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2810 } else {
b6179813 2811 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2812 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2813 }
7b46a644 2814 if (err < 0)
d889ce3b 2815 goto errout_free;
1da177e4 2816
3765d35e
DA
2817 rcu_read_unlock();
2818
15e47304 2819 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2820errout:
2942e900 2821 return err;
1da177e4 2822
d889ce3b 2823errout_free:
3765d35e 2824 rcu_read_unlock();
1da177e4 2825 kfree_skb(skb);
d889ce3b 2826 goto errout;
1da177e4
LT
2827}
2828
1da177e4
LT
2829void ip_rt_multicast_event(struct in_device *in_dev)
2830{
4ccfe6d4 2831 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2832}
2833
2834#ifdef CONFIG_SYSCTL
082c7ca4
G
2835static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2836static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2837static int ip_rt_gc_elasticity __read_mostly = 8;
2838
fe2c6338 2839static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2840 void __user *buffer,
1da177e4
LT
2841 size_t *lenp, loff_t *ppos)
2842{
5aad1de5
TT
2843 struct net *net = (struct net *)__ctl->extra1;
2844
1da177e4 2845 if (write) {
5aad1de5
TT
2846 rt_cache_flush(net);
2847 fnhe_genid_bump(net);
1da177e4 2848 return 0;
e905a9ed 2849 }
1da177e4
LT
2850
2851 return -EINVAL;
2852}
2853
fe2c6338 2854static struct ctl_table ipv4_route_table[] = {
1da177e4 2855 {
1da177e4
LT
2856 .procname = "gc_thresh",
2857 .data = &ipv4_dst_ops.gc_thresh,
2858 .maxlen = sizeof(int),
2859 .mode = 0644,
6d9f239a 2860 .proc_handler = proc_dointvec,
1da177e4
LT
2861 },
2862 {
1da177e4
LT
2863 .procname = "max_size",
2864 .data = &ip_rt_max_size,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
6d9f239a 2867 .proc_handler = proc_dointvec,
1da177e4
LT
2868 },
2869 {
2870 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2871
1da177e4
LT
2872 .procname = "gc_min_interval",
2873 .data = &ip_rt_gc_min_interval,
2874 .maxlen = sizeof(int),
2875 .mode = 0644,
6d9f239a 2876 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2877 },
2878 {
1da177e4
LT
2879 .procname = "gc_min_interval_ms",
2880 .data = &ip_rt_gc_min_interval,
2881 .maxlen = sizeof(int),
2882 .mode = 0644,
6d9f239a 2883 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2884 },
2885 {
1da177e4
LT
2886 .procname = "gc_timeout",
2887 .data = &ip_rt_gc_timeout,
2888 .maxlen = sizeof(int),
2889 .mode = 0644,
6d9f239a 2890 .proc_handler = proc_dointvec_jiffies,
1da177e4 2891 },
9f28a2fc
ED
2892 {
2893 .procname = "gc_interval",
2894 .data = &ip_rt_gc_interval,
2895 .maxlen = sizeof(int),
2896 .mode = 0644,
2897 .proc_handler = proc_dointvec_jiffies,
2898 },
1da177e4 2899 {
1da177e4
LT
2900 .procname = "redirect_load",
2901 .data = &ip_rt_redirect_load,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
6d9f239a 2904 .proc_handler = proc_dointvec,
1da177e4
LT
2905 },
2906 {
1da177e4
LT
2907 .procname = "redirect_number",
2908 .data = &ip_rt_redirect_number,
2909 .maxlen = sizeof(int),
2910 .mode = 0644,
6d9f239a 2911 .proc_handler = proc_dointvec,
1da177e4
LT
2912 },
2913 {
1da177e4
LT
2914 .procname = "redirect_silence",
2915 .data = &ip_rt_redirect_silence,
2916 .maxlen = sizeof(int),
2917 .mode = 0644,
6d9f239a 2918 .proc_handler = proc_dointvec,
1da177e4
LT
2919 },
2920 {
1da177e4
LT
2921 .procname = "error_cost",
2922 .data = &ip_rt_error_cost,
2923 .maxlen = sizeof(int),
2924 .mode = 0644,
6d9f239a 2925 .proc_handler = proc_dointvec,
1da177e4
LT
2926 },
2927 {
1da177e4
LT
2928 .procname = "error_burst",
2929 .data = &ip_rt_error_burst,
2930 .maxlen = sizeof(int),
2931 .mode = 0644,
6d9f239a 2932 .proc_handler = proc_dointvec,
1da177e4
LT
2933 },
2934 {
1da177e4
LT
2935 .procname = "gc_elasticity",
2936 .data = &ip_rt_gc_elasticity,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
6d9f239a 2939 .proc_handler = proc_dointvec,
1da177e4
LT
2940 },
2941 {
1da177e4
LT
2942 .procname = "mtu_expires",
2943 .data = &ip_rt_mtu_expires,
2944 .maxlen = sizeof(int),
2945 .mode = 0644,
6d9f239a 2946 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2947 },
2948 {
1da177e4
LT
2949 .procname = "min_pmtu",
2950 .data = &ip_rt_min_pmtu,
2951 .maxlen = sizeof(int),
2952 .mode = 0644,
3bcf69f8
SD
2953 .proc_handler = proc_dointvec_minmax,
2954 .extra1 = &ip_min_valid_pmtu,
1da177e4
LT
2955 },
2956 {
1da177e4
LT
2957 .procname = "min_adv_mss",
2958 .data = &ip_rt_min_advmss,
2959 .maxlen = sizeof(int),
2960 .mode = 0644,
6d9f239a 2961 .proc_handler = proc_dointvec,
1da177e4 2962 },
f8572d8f 2963 { }
1da177e4 2964};
39a23e75 2965
39a23e75
DL
2966static struct ctl_table ipv4_route_flush_table[] = {
2967 {
39a23e75
DL
2968 .procname = "flush",
2969 .maxlen = sizeof(int),
2970 .mode = 0200,
6d9f239a 2971 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2972 },
f8572d8f 2973 { },
39a23e75
DL
2974};
2975
2976static __net_init int sysctl_route_net_init(struct net *net)
2977{
2978 struct ctl_table *tbl;
2979
2980 tbl = ipv4_route_flush_table;
09ad9bc7 2981 if (!net_eq(net, &init_net)) {
39a23e75 2982 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2983 if (!tbl)
39a23e75 2984 goto err_dup;
464dc801
EB
2985
2986 /* Don't export sysctls to unprivileged users */
2987 if (net->user_ns != &init_user_ns)
2988 tbl[0].procname = NULL;
39a23e75
DL
2989 }
2990 tbl[0].extra1 = net;
2991
ec8f23ce 2992 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2993 if (!net->ipv4.route_hdr)
39a23e75
DL
2994 goto err_reg;
2995 return 0;
2996
2997err_reg:
2998 if (tbl != ipv4_route_flush_table)
2999 kfree(tbl);
3000err_dup:
3001 return -ENOMEM;
3002}
3003
3004static __net_exit void sysctl_route_net_exit(struct net *net)
3005{
3006 struct ctl_table *tbl;
3007
3008 tbl = net->ipv4.route_hdr->ctl_table_arg;
3009 unregister_net_sysctl_table(net->ipv4.route_hdr);
3010 BUG_ON(tbl == ipv4_route_flush_table);
3011 kfree(tbl);
3012}
3013
3014static __net_initdata struct pernet_operations sysctl_route_ops = {
3015 .init = sysctl_route_net_init,
3016 .exit = sysctl_route_net_exit,
3017};
1da177e4
LT
3018#endif
3019
3ee94372 3020static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3021{
ca4c3fc2 3022 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 3023 atomic_set(&net->fnhe_genid, 0);
7aed9f72 3024 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
3025 return 0;
3026}
3027
3ee94372
NH
3028static __net_initdata struct pernet_operations rt_genid_ops = {
3029 .init = rt_genid_init,
9f5e97e5
DL
3030};
3031
c3426b47
DM
3032static int __net_init ipv4_inetpeer_init(struct net *net)
3033{
3034 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3035
3036 if (!bp)
3037 return -ENOMEM;
3038 inet_peer_base_init(bp);
3039 net->ipv4.peers = bp;
3040 return 0;
3041}
3042
3043static void __net_exit ipv4_inetpeer_exit(struct net *net)
3044{
3045 struct inet_peer_base *bp = net->ipv4.peers;
3046
3047 net->ipv4.peers = NULL;
56a6b248 3048 inetpeer_invalidate_tree(bp);
c3426b47
DM
3049 kfree(bp);
3050}
3051
3052static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3053 .init = ipv4_inetpeer_init,
3054 .exit = ipv4_inetpeer_exit,
3055};
9f5e97e5 3056
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated (256 entries) in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3060
1da177e4
LT
3061int __init ip_rt_init(void)
3062{
424c4b70 3063 int rc = 0;
5055c371 3064 int cpu;
1da177e4 3065
73f156a6
ED
3066 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3067 if (!ip_idents)
3068 panic("IP: failed to allocate ip_idents\n");
3069
3070 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3071
355b590c
ED
3072 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3073 if (!ip_tstamps)
3074 panic("IP: failed to allocate ip_tstamps\n");
3075
5055c371
ED
3076 for_each_possible_cpu(cpu) {
3077 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3078
3079 INIT_LIST_HEAD(&ul->head);
3080 spin_lock_init(&ul->lock);
3081 }
c7066f70 3082#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3083 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3084 if (!ip_rt_acct)
3085 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3086#endif
3087
e5d679f3
AD
3088 ipv4_dst_ops.kmem_cachep =
3089 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3090 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3091
14e50e57
DM
3092 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3093
fc66f95c
ED
3094 if (dst_entries_init(&ipv4_dst_ops) < 0)
3095 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3096
3097 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3098 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3099
89aef892
DM
3100 ipv4_dst_ops.gc_thresh = ~0;
3101 ip_rt_max_size = INT_MAX;
1da177e4 3102
1da177e4
LT
3103 devinet_init();
3104 ip_fib_init();
3105
73b38711 3106 if (ip_rt_proc_init())
058bd4d2 3107 pr_err("Unable to create route proc files\n");
1da177e4
LT
3108#ifdef CONFIG_XFRM
3109 xfrm_init();
703fb94e 3110 xfrm4_init();
1da177e4 3111#endif
394f51ab
FW
3112 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3113 RTNL_FLAG_DOIT_UNLOCKED);
63f3444f 3114
39a23e75
DL
3115#ifdef CONFIG_SYSCTL
3116 register_pernet_subsys(&sysctl_route_ops);
3117#endif
3ee94372 3118 register_pernet_subsys(&rt_genid_ops);
c3426b47 3119 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3120 return rc;
3121}
3122
a1bc6eb4 3123#ifdef CONFIG_SYSCTL
eeb61f71
AV
3124/*
3125 * We really need to sanitize the damn ipv4 init order, then all
3126 * this nonsense will go away.
3127 */
3128void __init ip_static_sysctl_init(void)
3129{
4e5ca785 3130 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3131}
a1bc6eb4 3132#endif