net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135
 136 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
 137
 138 /*
 139  *      Interface to generic destination cache.
 140  */
 141
 142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 143 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 144 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 148                                            struct sk_buff *skb, u32 mtu);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         return NULL;
 281
 282 }
 283
 284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 285 {
 286
 287 }
 288
 289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 290 {
 291         struct rt_cache_stat *st = v;
 292
 293         if (v == SEQ_START_TOKEN) {
 294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 295                 return 0;
 296         }
 297
 298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 300                    dst_entries_get_slow(&ipv4_dst_ops),
 301                    0, /* st->in_hit */
 302                    st->in_slow_tot,
 303                    st->in_slow_mc,
 304                    st->in_no_route,
 305                    st->in_brd,
 306                    st->in_martian_dst,
 307                    st->in_martian_src,
 308
 309                    0, /* st->out_hit */
 310                    st->out_slow_tot,
 311                    st->out_slow_mc,
 312
 313                    0, /* st->gc_total */
 314                    0, /* st->gc_ignored */
 315                    0, /* st->gc_goal_miss */
 316                    0, /* st->gc_dst_overflow */
 317                    0, /* st->in_hlist_search */
 318                    0  /* st->out_hlist_search */
 319                 );
 320         return 0;
 321 }
 322
 323 static const struct seq_operations rt_cpu_seq_ops = {
 324         .start  = rt_cpu_seq_start,
 325         .next   = rt_cpu_seq_next,
 326         .stop   = rt_cpu_seq_stop,
 327         .show   = rt_cpu_seq_show,
 328 };
 329
 330
 331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 332 {
 333         return seq_open(file, &rt_cpu_seq_ops);
 334 }
 335
 336 static const struct file_operations rt_cpu_seq_fops = {
 337         .owner   = THIS_MODULE,
 338         .open    = rt_cpu_seq_open,
 339         .read    = seq_read,
 340         .llseek  = seq_lseek,
 341         .release = seq_release,
 342 };
 343
 344 #ifdef CONFIG_IP_ROUTE_CLASSID
 345 static int rt_acct_proc_show(struct seq_file *m, void *v)
 346 {
 347         struct ip_rt_acct *dst, *src;
 348         unsigned int i, j;
 349
 350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351         if (!dst)
 352                 return -ENOMEM;
 353
 354         for_each_possible_cpu(i) {
 355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                 for (j = 0; j < 256; j++) {
 357                         dst[j].o_bytes   += src[j].o_bytes;
 358                         dst[j].o_packets += src[j].o_packets;
 359                         dst[j].i_bytes   += src[j].i_bytes;
 360                         dst[j].i_packets += src[j].i_packets;
 361                 }
 362         }
 363
 364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365         kfree(dst);
 366         return 0;
 367 }
 368
 369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 370 {
 371         return single_open(file, rt_acct_proc_show, NULL);
 372 }
 373
 374 static const struct file_operations rt_acct_proc_fops = {
 375         .owner          = THIS_MODULE,
 376         .open           = rt_acct_proc_open,
 377         .read           = seq_read,
 378         .llseek         = seq_lseek,
 379         .release        = single_release,
 380 };
 381 #endif
 382
 383 static int __net_init ip_rt_do_proc_init(struct net *net)
 384 {
 385         struct proc_dir_entry *pde;
 386
 387         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 388                           &rt_cache_seq_fops);
 389         if (!pde)
 390                 goto err1;
 391
 392         pde = proc_create("rt_cache", S_IRUGO,
 393                           net->proc_net_stat, &rt_cpu_seq_fops);
 394         if (!pde)
 395                 goto err2;
 396
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 399         if (!pde)
 400                 goto err3;
 401 #endif
 402         return 0;
 403
 404 #ifdef CONFIG_IP_ROUTE_CLASSID
 405 err3:
 406         remove_proc_entry("rt_cache", net->proc_net_stat);
 407 #endif
 408 err2:
 409         remove_proc_entry("rt_cache", net->proc_net);
 410 err1:
 411         return -ENOMEM;
 412 }
 413
 414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 415 {
 416         remove_proc_entry("rt_cache", net->proc_net_stat);
 417         remove_proc_entry("rt_cache", net->proc_net);
 418 #ifdef CONFIG_IP_ROUTE_CLASSID
 419         remove_proc_entry("rt_acct", net->proc_net);
 420 #endif
 421 }
 422
 423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 424         .init = ip_rt_do_proc_init,
 425         .exit = ip_rt_do_proc_exit,
 426 };
 427
 428 static int __init ip_rt_proc_init(void)
 429 {
 430         return register_pernet_subsys(&ip_rt_proc_ops);
 431 }
 432
 433 #else
 434 static inline int ip_rt_proc_init(void)
 435 {
 436         return 0;
 437 }
 438 #endif /* CONFIG_PROC_FS */
 439
 440 static inline bool rt_is_expired(const struct rtable *rth)
 441 {
 442         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 443 }
 444
 445 void rt_cache_flush(struct net *net)
 446 {
 447         rt_genid_bump_ipv4(net);
 448 }
 449
 450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                            struct sk_buff *skb,
 452                                            const void *daddr)
 453 {
 454         struct net_device *dev = dst->dev;
 455         const __be32 *pkey = daddr;
 456         const struct rtable *rt;
 457         struct neighbour *n;
 458
 459         rt = (const struct rtable *) dst;
 460         if (rt->rt_gateway)
 461                 pkey = (const __be32 *) &rt->rt_gateway;
 462         else if (skb)
 463                 pkey = &ip_hdr(skb)->daddr;
 464
 465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466         if (n)
 467                 return n;
 468         return neigh_create(&arp_tbl, pkey, dev);
 469 }
 470
 471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 472 {
 473         struct net_device *dev = dst->dev;
 474         const __be32 *pkey = daddr;
 475         const struct rtable *rt;
 476
 477         rt = (const struct rtable *)dst;
 478         if (rt->rt_gateway)
 479                 pkey = (const __be32 *)&rt->rt_gateway;
 480         else if (!daddr ||
 481                  (rt->rt_flags &
 482                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 483                 return;
 484
 485         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 486 }
 487
 488 #define IP_IDENTS_SZ 2048u
 489
 490 static atomic_t *ip_idents __read_mostly;
 491 static u32 *ip_tstamps __read_mostly;
 492
 493 /* In order to protect privacy, we add a perturbation to identifiers
 494  * if one generator is seldom used. This makes hard for an attacker
 495  * to infer how many packets were sent between two points in time.
 496  */
 497 u32 ip_idents_reserve(u32 hash, int segs)
 498 {
 499         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 500         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 501         u32 old = ACCESS_ONCE(*p_tstamp);
 502         u32 now = (u32)jiffies;
 503         u32 new, delta = 0;
 504
 505         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 506                 delta = prandom_u32_max(now - old);
 507
 508         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 509         do {
 510                 old = (u32)atomic_read(p_id);
 511                 new = old + delta + segs;
 512         } while (atomic_cmpxchg(p_id, old, new) != old);
 513
 514         return new - segs;
 515 }
 516 EXPORT_SYMBOL(ip_idents_reserve);
 517
 518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 519 {
 520         u32 hash, id;
 521
 522         /* Note the following code is not safe, but this is okay. */
 523         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 524                 get_random_bytes(&net->ipv4.ip_id_key,
 525                                  sizeof(net->ipv4.ip_id_key));
 526
 527         hash = siphash_3u32((__force u32)iph->daddr,
 528                             (__force u32)iph->saddr,
 529                             iph->protocol,
 530                             &net->ipv4.ip_id_key);
 531         id = ip_idents_reserve(hash, segs);
 532         iph->id = htons(id);
 533 }
 534 EXPORT_SYMBOL(__ip_select_ident);
 535
 536 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 537                              const struct sock *sk,
 538                              const struct iphdr *iph,
 539                              int oif, u8 tos,
 540                              u8 prot, u32 mark, int flow_flags)
 541 {
 542         if (sk) {
 543                 const struct inet_sock *inet = inet_sk(sk);
 544
 545                 oif = sk->sk_bound_dev_if;
 546                 mark = sk->sk_mark;
 547                 tos = RT_CONN_FLAGS(sk);
 548                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 549         }
 550         flowi4_init_output(fl4, oif, mark, tos,
 551                            RT_SCOPE_UNIVERSE, prot,
 552                            flow_flags,
 553                            iph->daddr, iph->saddr, 0, 0,
 554                            sock_net_uid(net, sk));
 555 }
 556
 557 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 558                                const struct sock *sk)
 559 {
 560         const struct net *net = dev_net(skb->dev);
 561         const struct iphdr *iph = ip_hdr(skb);
 562         int oif = skb->dev->ifindex;
 563         u8 tos = RT_TOS(iph->tos);
 564         u8 prot = iph->protocol;
 565         u32 mark = skb->mark;
 566
 567         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 568 }
 569
 570 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 571 {
 572         const struct inet_sock *inet = inet_sk(sk);
 573         const struct ip_options_rcu *inet_opt;
 574         __be32 daddr = inet->inet_daddr;
 575
 576         rcu_read_lock();
 577         inet_opt = rcu_dereference(inet->inet_opt);
 578         if (inet_opt && inet_opt->opt.srr)
 579                 daddr = inet_opt->opt.faddr;
 580         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 581                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 582                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 583                            inet_sk_flowi_flags(sk),
 584                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 585         rcu_read_unlock();
 586 }
 587
 588 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 589                                  const struct sk_buff *skb)
 590 {
 591         if (skb)
 592                 build_skb_flow_key(fl4, skb, sk);
 593         else
 594                 build_sk_flow_key(fl4, sk);
 595 }
 596
 597 static DEFINE_SPINLOCK(fnhe_lock);
 598
 599 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 600 {
 601         struct rtable *rt;
 602
 603         rt = rcu_dereference(fnhe->fnhe_rth_input);
 604         if (rt) {
 605                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 606                 dst_dev_put(&rt->dst);
 607                 dst_release(&rt->dst);
 608         }
 609         rt = rcu_dereference(fnhe->fnhe_rth_output);
 610         if (rt) {
 611                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 612                 dst_dev_put(&rt->dst);
 613                 dst_release(&rt->dst);
 614         }
 615 }
 616
 617 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 618 {
 619         struct fib_nh_exception *fnhe, *oldest;
 620
 621         oldest = rcu_dereference(hash->chain);
 622         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 623              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 624                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 625                         oldest = fnhe;
 626         }
 627         fnhe_flush_routes(oldest);
 628         return oldest;
 629 }
 630
 631 static inline u32 fnhe_hashfun(__be32 daddr)
 632 {
 633         static u32 fnhe_hashrnd __read_mostly;
 634         u32 hval;
 635
 636         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 637         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 638         return hash_32(hval, FNHE_HASH_SHIFT);
 639 }
 640
 641 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 642 {
 643         rt->rt_pmtu = fnhe->fnhe_pmtu;
 644         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 645         rt->dst.expires = fnhe->fnhe_expires;
 646
 647         if (fnhe->fnhe_gw) {
 648                 rt->rt_flags |= RTCF_REDIRECTED;
 649                 rt->rt_gateway = fnhe->fnhe_gw;
 650                 rt->rt_uses_gateway = 1;
 651         }
 652 }
 653
 654 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 655                                   u32 pmtu, bool lock, unsigned long expires)
 656 {
 657         struct fnhe_hash_bucket *hash;
 658         struct fib_nh_exception *fnhe;
 659         struct rtable *rt;
 660         u32 genid, hval;
 661         unsigned int i;
 662         int depth;
 663
 664         genid = fnhe_genid(dev_net(nh->nh_dev));
 665         hval = fnhe_hashfun(daddr);
 666
 667         spin_lock_bh(&fnhe_lock);
 668
 669         hash = rcu_dereference(nh->nh_exceptions);
 670         if (!hash) {
 671                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 672                 if (!hash)
 673                         goto out_unlock;
 674                 rcu_assign_pointer(nh->nh_exceptions, hash);
 675         }
 676
 677         hash += hval;
 678
 679         depth = 0;
 680         for (fnhe = rcu_dereference(hash->chain); fnhe;
 681              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 682                 if (fnhe->fnhe_daddr == daddr)
 683                         break;
 684                 depth++;
 685         }
 686
 687         if (fnhe) {
 688                 if (fnhe->fnhe_genid != genid)
 689                         fnhe->fnhe_genid = genid;
 690                 if (gw)
 691                         fnhe->fnhe_gw = gw;
 692                 if (pmtu) {
 693                         fnhe->fnhe_pmtu = pmtu;
 694                         fnhe->fnhe_mtu_locked = lock;
 695                 }
 696                 fnhe->fnhe_expires = max(1UL, expires);
 697                 /* Update all cached dsts too */
 698                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 699                 if (rt)
 700                         fill_route_from_fnhe(rt, fnhe);
 701                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 702                 if (rt)
 703                         fill_route_from_fnhe(rt, fnhe);
 704         } else {
 705                 if (depth > FNHE_RECLAIM_DEPTH)
 706                         fnhe = fnhe_oldest(hash);
 707                 else {
 708                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 709                         if (!fnhe)
 710                                 goto out_unlock;
 711
 712                         fnhe->fnhe_next = hash->chain;
 713                         rcu_assign_pointer(hash->chain, fnhe);
 714                 }
 715                 fnhe->fnhe_genid = genid;
 716                 fnhe->fnhe_daddr = daddr;
 717                 fnhe->fnhe_gw = gw;
 718                 fnhe->fnhe_pmtu = pmtu;
 719                 fnhe->fnhe_mtu_locked = lock;
 720                 fnhe->fnhe_expires = max(1UL, expires);
 721
 722                 /* Exception created; mark the cached routes for the nexthop
 723                  * stale, so anyone caching it rechecks if this exception
 724                  * applies to them.
 725                  */
 726                 rt = rcu_dereference(nh->nh_rth_input);
 727                 if (rt)
 728                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 729
 730                 for_each_possible_cpu(i) {
 731                         struct rtable __rcu **prt;
 732                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 733                         rt = rcu_dereference(*prt);
 734                         if (rt)
 735                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 736                 }
 737         }
 738
 739         fnhe->fnhe_stamp = jiffies;
 740
 741 out_unlock:
 742         spin_unlock_bh(&fnhe_lock);
 743 }
 744
 745 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 746                              bool kill_route)
 747 {
 748         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 749         __be32 old_gw = ip_hdr(skb)->saddr;
 750         struct net_device *dev = skb->dev;
 751         struct in_device *in_dev;
 752         struct fib_result res;
 753         struct neighbour *n;
 754         struct net *net;
 755
 756         switch (icmp_hdr(skb)->code & 7) {
 757         case ICMP_REDIR_NET:
 758         case ICMP_REDIR_NETTOS:
 759         case ICMP_REDIR_HOST:
 760         case ICMP_REDIR_HOSTTOS:
 761                 break;
 762
 763         default:
 764                 return;
 765         }
 766
 767         if (rt->rt_gateway != old_gw)
 768                 return;
 769
 770         in_dev = __in_dev_get_rcu(dev);
 771         if (!in_dev)
 772                 return;
 773
 774         net = dev_net(dev);
 775         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 776             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 777             ipv4_is_zeronet(new_gw))
 778                 goto reject_redirect;
 779
 780         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 781                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 782                         goto reject_redirect;
 783                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 784                         goto reject_redirect;
 785         } else {
 786                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 787                         goto reject_redirect;
 788         }
 789
 790         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 791         if (!n)
 792                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 793         if (!IS_ERR(n)) {
 794                 if (!(n->nud_state & NUD_VALID)) {
 795                         neigh_event_send(n, NULL);
 796                 } else {
 797                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 798                                 struct fib_nh *nh = &FIB_RES_NH(res);
 799
 800                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 801                                                 0, false,
 802                                                 jiffies + ip_rt_gc_timeout);
 803                         }
 804                         if (kill_route)
 805                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 806                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 807                 }
 808                 neigh_release(n);
 809         }
 810         return;
 811
 812 reject_redirect:
 813 #ifdef CONFIG_IP_ROUTE_VERBOSE
 814         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 815                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 816                 __be32 daddr = iph->daddr;
 817                 __be32 saddr = iph->saddr;
 818
 819                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 820                                      "  Advised path = %pI4 -> %pI4\n",
 821                                      &old_gw, dev->name, &new_gw,
 822                                      &saddr, &daddr);
 823         }
 824 #endif
 825         ;
 826 }
 827
 828 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 829 {
 830         struct rtable *rt;
 831         struct flowi4 fl4;
 832         const struct iphdr *iph = (const struct iphdr *) skb->data;
 833         struct net *net = dev_net(skb->dev);
 834         int oif = skb->dev->ifindex;
 835         u8 tos = RT_TOS(iph->tos);
 836         u8 prot = iph->protocol;
 837         u32 mark = skb->mark;
 838
 839         rt = (struct rtable *) dst;
 840
 841         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 842         __ip_do_redirect(rt, skb, &fl4, true);
 843 }
 844
 845 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 846 {
 847         struct rtable *rt = (struct rtable *)dst;
 848         struct dst_entry *ret = dst;
 849
 850         if (rt) {
 851                 if (dst->obsolete > 0) {
 852                         ip_rt_put(rt);
 853                         ret = NULL;
 854                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 855                            rt->dst.expires) {
 856                         ip_rt_put(rt);
 857                         ret = NULL;
 858                 }
 859         }
 860         return ret;
 861 }
 862
 863 /*
 864  * Algorithm:
 865  *      1. The first ip_rt_redirect_number redirects are sent
 866  *         with exponential backoff, then we stop sending them at all,
 867  *         assuming that the host ignores our redirects.
 868  *      2. If we did not see packets requiring redirects
 869  *         during ip_rt_redirect_silence, we assume that the host
 870  *         forgot redirected route and start to send redirects again.
 871  *
 872  * This algorithm is much cheaper and more intelligent than dumb load limiting
 873  * in icmp.c.
 874  *
 875  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 876  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 877  */
 878
 879 void ip_rt_send_redirect(struct sk_buff *skb)
 880 {
 881         struct rtable *rt = skb_rtable(skb);
 882         struct in_device *in_dev;
 883         struct inet_peer *peer;
 884         struct net *net;
 885         int log_martians;
 886         int vif;
 887
 888         rcu_read_lock();
 889         in_dev = __in_dev_get_rcu(rt->dst.dev);
 890         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 891                 rcu_read_unlock();
 892                 return;
 893         }
 894         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 895         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 896         rcu_read_unlock();
 897
 898         net = dev_net(rt->dst.dev);
 899         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 900         if (!peer) {
 901                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 902                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 903                 return;
 904         }
 905
 906         /* No redirected packets during ip_rt_redirect_silence;
 907          * reset the algorithm.
 908          */
 909         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 910                 peer->rate_tokens = 0;
 911                 peer->n_redirects = 0;
 912         }
 913
 914         /* Too many ignored redirects; do not send anything
 915          * set dst.rate_last to the last seen redirected packet.
 916          */
 917         if (peer->n_redirects >= ip_rt_redirect_number) {
 918                 peer->rate_last = jiffies;
 919                 goto out_put_peer;
 920         }
 921
 922         /* Check for load limit; set rate_last to the latest sent
 923          * redirect.
 924          */
 925         if (peer->rate_tokens == 0 ||
 926             time_after(jiffies,
 927                        (peer->rate_last +
 928                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 929                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 930
 931                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 932                 peer->rate_last = jiffies;
 933                 ++peer->rate_tokens;
 934                 ++peer->n_redirects;
 935 #ifdef CONFIG_IP_ROUTE_VERBOSE
 936                 if (log_martians &&
 937                     peer->rate_tokens == ip_rt_redirect_number)
 938                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 939                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 940                                              &ip_hdr(skb)->daddr, &gw);
 941 #endif
 942         }
 943 out_put_peer:
 944         inet_putpeer(peer);
 945 }
 946
 947 static int ip_error(struct sk_buff *skb)
 948 {
 949         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 950         struct rtable *rt = skb_rtable(skb);
 951         struct inet_peer *peer;
 952         unsigned long now;
 953         struct net *net;
 954         bool send;
 955         int code;
 956
 957         /* IP on this device is disabled. */
 958         if (!in_dev)
 959                 goto out;
 960
 961         net = dev_net(rt->dst.dev);
 962         if (!IN_DEV_FORWARD(in_dev)) {
 963                 switch (rt->dst.error) {
 964                 case EHOSTUNREACH:
 965                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 966                         break;
 967
 968                 case ENETUNREACH:
 969                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 970                         break;
 971                 }
 972                 goto out;
 973         }
 974
 975         switch (rt->dst.error) {
 976         case EINVAL:
 977         default:
 978                 goto out;
 979         case EHOSTUNREACH:
 980                 code = ICMP_HOST_UNREACH;
 981                 break;
 982         case ENETUNREACH:
 983                 code = ICMP_NET_UNREACH;
 984                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 985                 break;
 986         case EACCES:
 987                 code = ICMP_PKT_FILTERED;
 988                 break;
 989         }
 990
 991         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 992                                l3mdev_master_ifindex(skb->dev), 1);
 993
 994         send = true;
 995         if (peer) {
 996                 now = jiffies;
 997                 peer->rate_tokens += now - peer->rate_last;
 998                 if (peer->rate_tokens > ip_rt_error_burst)
 999                         peer->rate_tokens = ip_rt_error_burst;
1000                 peer->rate_last = now;
1001                 if (peer->rate_tokens >= ip_rt_error_cost)
1002                         peer->rate_tokens -= ip_rt_error_cost;
1003                 else
1004                         send = false;
1005                 inet_putpeer(peer);
1006         }
1007         if (send)
1008                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out:    kfree_skb(skb);
1011         return 0;
1012 }
1013
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016         struct dst_entry *dst = &rt->dst;
1017         struct fib_result res;
1018         bool lock = false;
1019
1020         if (ip_mtu_locked(dst))
1021                 return;
1022
1023         if (ipv4_mtu(dst) < mtu)
1024                 return;
1025
1026         if (mtu < ip_rt_min_pmtu) {
1027                 lock = true;
1028                 mtu = ip_rt_min_pmtu;
1029         }
1030
1031         if (rt->rt_pmtu == mtu &&
1032             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033                 return;
1034
1035         rcu_read_lock();
1036         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1037                 struct fib_nh *nh = &FIB_RES_NH(res);
1038
1039                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1040                                       jiffies + ip_rt_mtu_expires);
1041         }
1042         rcu_read_unlock();
1043 }
1044
1045 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046                               struct sk_buff *skb, u32 mtu)
1047 {
1048         struct rtable *rt = (struct rtable *) dst;
1049         struct flowi4 fl4;
1050
1051         ip_rt_build_flow_key(&fl4, sk, skb);
1052         __ip_rt_update_pmtu(rt, &fl4, mtu);
1053 }
1054
1055 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1056                       int oif, u32 mark, u8 protocol, int flow_flags)
1057 {
1058         const struct iphdr *iph = (const struct iphdr *) skb->data;
1059         struct flowi4 fl4;
1060         struct rtable *rt;
1061
1062         if (!mark)
1063                 mark = IP4_REPLY_MARK(net, skb->mark);
1064
1065         __build_flow_key(net, &fl4, NULL, iph, oif,
1066                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1067         rt = __ip_route_output_key(net, &fl4);
1068         if (!IS_ERR(rt)) {
1069                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070                 ip_rt_put(rt);
1071         }
1072 }
1073 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1074
1075 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077         const struct iphdr *iph = (const struct iphdr *) skb->data;
1078         struct flowi4 fl4;
1079         struct rtable *rt;
1080
1081         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1082
1083         if (!fl4.flowi4_mark)
1084                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1085
1086         rt = __ip_route_output_key(sock_net(sk), &fl4);
1087         if (!IS_ERR(rt)) {
1088                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089                 ip_rt_put(rt);
1090         }
1091 }
1092
1093 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1094 {
1095         const struct iphdr *iph = (const struct iphdr *) skb->data;
1096         struct flowi4 fl4;
1097         struct rtable *rt;
1098         struct dst_entry *odst = NULL;
1099         bool new = false;
1100         struct net *net = sock_net(sk);
1101
1102         bh_lock_sock(sk);
1103
1104         if (!ip_sk_accept_pmtu(sk))
1105                 goto out;
1106
1107         odst = sk_dst_get(sk);
1108
1109         if (sock_owned_by_user(sk) || !odst) {
1110                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1111                 goto out;
1112         }
1113
1114         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1115
1116         rt = (struct rtable *)odst;
1117         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1118                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119                 if (IS_ERR(rt))
1120                         goto out;
1121
1122                 new = true;
1123         }
1124
1125         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1126
1127         if (!dst_check(&rt->dst, 0)) {
1128                 if (new)
1129                         dst_release(&rt->dst);
1130
1131                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1132                 if (IS_ERR(rt))
1133                         goto out;
1134
1135                 new = true;
1136         }
1137
1138         if (new)
1139                 sk_dst_set(sk, &rt->dst);
1140
1141 out:
1142         bh_unlock_sock(sk);
1143         dst_release(odst);
1144 }
1145 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1146
1147 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1148                    int oif, u32 mark, u8 protocol, int flow_flags)
1149 {
1150         const struct iphdr *iph = (const struct iphdr *) skb->data;
1151         struct flowi4 fl4;
1152         struct rtable *rt;
1153
1154         __build_flow_key(net, &fl4, NULL, iph, oif,
1155                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1156         rt = __ip_route_output_key(net, &fl4);
1157         if (!IS_ERR(rt)) {
1158                 __ip_do_redirect(rt, skb, &fl4, false);
1159                 ip_rt_put(rt);
1160         }
1161 }
1162 EXPORT_SYMBOL_GPL(ipv4_redirect);
1163
1164 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1165 {
1166         const struct iphdr *iph = (const struct iphdr *) skb->data;
1167         struct flowi4 fl4;
1168         struct rtable *rt;
1169         struct net *net = sock_net(sk);
1170
1171         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1172         rt = __ip_route_output_key(net, &fl4);
1173         if (!IS_ERR(rt)) {
1174                 __ip_do_redirect(rt, skb, &fl4, false);
1175                 ip_rt_put(rt);
1176         }
1177 }
1178 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1179
1180 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1181 {
1182         struct rtable *rt = (struct rtable *) dst;
1183
1184         /* All IPV4 dsts are created with ->obsolete set to the value
1185          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1186          * into this function always.
1187          *
1188          * When a PMTU/redirect information update invalidates a route,
1189          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1190          * DST_OBSOLETE_DEAD by dst_free().
1191          */
1192         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1193                 return NULL;
1194         return dst;
1195 }
1196
1197 static void ipv4_link_failure(struct sk_buff *skb)
1198 {
1199         struct rtable *rt;
1200
1201         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1202
1203         rt = skb_rtable(skb);
1204         if (rt)
1205                 dst_set_expires(&rt->dst, 0);
1206 }
1207
1208 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1209 {
1210         pr_debug("%s: %pI4 -> %pI4, %s\n",
1211                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1212                  skb->dev ? skb->dev->name : "?");
1213         kfree_skb(skb);
1214         WARN_ON(1);
1215         return 0;
1216 }
1217
1218 /*
1219    We do not cache source address of outgoing interface,
1220    because it is used only by IP RR, TS and SRR options,
1221    so that it out of fast path.
1222
1223    BTW remember: "addr" is allowed to be not aligned
1224    in IP options!
1225  */
1226
1227 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1228 {
1229         __be32 src;
1230
1231         if (rt_is_output_route(rt))
1232                 src = ip_hdr(skb)->saddr;
1233         else {
1234                 struct fib_result res;
1235                 struct flowi4 fl4;
1236                 struct iphdr *iph;
1237
1238                 iph = ip_hdr(skb);
1239
1240                 memset(&fl4, 0, sizeof(fl4));
1241                 fl4.daddr = iph->daddr;
1242                 fl4.saddr = iph->saddr;
1243                 fl4.flowi4_tos = RT_TOS(iph->tos);
1244                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1245                 fl4.flowi4_iif = skb->dev->ifindex;
1246                 fl4.flowi4_mark = skb->mark;
1247
1248                 rcu_read_lock();
1249                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1250                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1251                 else
1252                         src = inet_select_addr(rt->dst.dev,
1253                                                rt_nexthop(rt, iph->daddr),
1254                                                RT_SCOPE_UNIVERSE);
1255                 rcu_read_unlock();
1256         }
1257         memcpy(addr, &src, 4);
1258 }
1259
1260 #ifdef CONFIG_IP_ROUTE_CLASSID
1261 static void set_class_tag(struct rtable *rt, u32 tag)
1262 {
1263         if (!(rt->dst.tclassid & 0xFFFF))
1264                 rt->dst.tclassid |= tag & 0xFFFF;
1265         if (!(rt->dst.tclassid & 0xFFFF0000))
1266                 rt->dst.tclassid |= tag & 0xFFFF0000;
1267 }
1268 #endif
1269
1270 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1271 {
1272         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1273         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1274                                     ip_rt_min_advmss);
1275
1276         return min(advmss, IPV4_MAX_PMTU - header_size);
1277 }
1278
1279 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1280 {
1281         const struct rtable *rt = (const struct rtable *) dst;
1282         unsigned int mtu = rt->rt_pmtu;
1283
1284         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1285                 mtu = dst_metric_raw(dst, RTAX_MTU);
1286
1287         if (mtu)
1288                 return mtu;
1289
1290         mtu = READ_ONCE(dst->dev->mtu);
1291
1292         if (unlikely(ip_mtu_locked(dst))) {
1293                 if (rt->rt_uses_gateway && mtu > 576)
1294                         mtu = 576;
1295         }
1296
1297         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1298
1299         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1300 }
1301
1302 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1303 {
1304         struct fnhe_hash_bucket *hash;
1305         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1306         u32 hval = fnhe_hashfun(daddr);
1307
1308         spin_lock_bh(&fnhe_lock);
1309
1310         hash = rcu_dereference_protected(nh->nh_exceptions,
1311                                          lockdep_is_held(&fnhe_lock));
1312         hash += hval;
1313
1314         fnhe_p = &hash->chain;
1315         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1316         while (fnhe) {
1317                 if (fnhe->fnhe_daddr == daddr) {
1318                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1319                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1320                         /* set fnhe_daddr to 0 to ensure it won't bind with
1321                          * new dsts in rt_bind_exception().
1322                          */
1323                         fnhe->fnhe_daddr = 0;
1324                         fnhe_flush_routes(fnhe);
1325                         kfree_rcu(fnhe, rcu);
1326                         break;
1327                 }
1328                 fnhe_p = &fnhe->fnhe_next;
1329                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1330                                                  lockdep_is_held(&fnhe_lock));
1331         }
1332
1333         spin_unlock_bh(&fnhe_lock);
1334 }
1335
1336 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1337 {
1338         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1339         struct fib_nh_exception *fnhe;
1340         u32 hval;
1341
1342         if (!hash)
1343                 return NULL;
1344
1345         hval = fnhe_hashfun(daddr);
1346
1347         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1348              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1349                 if (fnhe->fnhe_daddr == daddr) {
1350                         if (fnhe->fnhe_expires &&
1351                             time_after(jiffies, fnhe->fnhe_expires)) {
1352                                 ip_del_fnhe(nh, daddr);
1353                                 break;
1354                         }
1355                         return fnhe;
1356                 }
1357         }
1358         return NULL;
1359 }
1360
1361 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1362                               __be32 daddr, const bool do_cache)
1363 {
1364         bool ret = false;
1365
1366         spin_lock_bh(&fnhe_lock);
1367
1368         if (daddr == fnhe->fnhe_daddr) {
1369                 struct rtable __rcu **porig;
1370                 struct rtable *orig;
1371                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1372
1373                 if (rt_is_input_route(rt))
1374                         porig = &fnhe->fnhe_rth_input;
1375                 else
1376                         porig = &fnhe->fnhe_rth_output;
1377                 orig = rcu_dereference(*porig);
1378
1379                 if (fnhe->fnhe_genid != genid) {
1380                         fnhe->fnhe_genid = genid;
1381                         fnhe->fnhe_gw = 0;
1382                         fnhe->fnhe_pmtu = 0;
1383                         fnhe->fnhe_expires = 0;
1384                         fnhe_flush_routes(fnhe);
1385                         orig = NULL;
1386                 }
1387                 fill_route_from_fnhe(rt, fnhe);
1388                 if (!rt->rt_gateway)
1389                         rt->rt_gateway = daddr;
1390
1391                 if (do_cache) {
1392                         dst_hold(&rt->dst);
1393                         rcu_assign_pointer(*porig, rt);
1394                         if (orig) {
1395                                 dst_dev_put(&orig->dst);
1396                                 dst_release(&orig->dst);
1397                         }
1398                         ret = true;
1399                 }
1400
1401                 fnhe->fnhe_stamp = jiffies;
1402         }
1403         spin_unlock_bh(&fnhe_lock);
1404
1405         return ret;
1406 }
1407
1408 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1409 {
1410         struct rtable *orig, *prev, **p;
1411         bool ret = true;
1412
1413         if (rt_is_input_route(rt)) {
1414                 p = (struct rtable **)&nh->nh_rth_input;
1415         } else {
1416                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1417         }
1418         orig = *p;
1419
1420         /* hold dst before doing cmpxchg() to avoid race condition
1421          * on this dst
1422          */
1423         dst_hold(&rt->dst);
1424         prev = cmpxchg(p, orig, rt);
1425         if (prev == orig) {
1426                 if (orig) {
1427                         dst_dev_put(&orig->dst);
1428                         dst_release(&orig->dst);
1429                 }
1430         } else {
1431                 dst_release(&rt->dst);
1432                 ret = false;
1433         }
1434
1435         return ret;
1436 }
1437
1438 struct uncached_list {
1439         spinlock_t              lock;
1440         struct list_head        head;
1441 };
1442
1443 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1444
1445 static void rt_add_uncached_list(struct rtable *rt)
1446 {
1447         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1448
1449         rt->rt_uncached_list = ul;
1450
1451         spin_lock_bh(&ul->lock);
1452         list_add_tail(&rt->rt_uncached, &ul->head);
1453         spin_unlock_bh(&ul->lock);
1454 }
1455
1456 static void ipv4_dst_destroy(struct dst_entry *dst)
1457 {
1458         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1459         struct rtable *rt = (struct rtable *) dst;
1460
1461         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1462                 kfree(p);
1463
1464         if (!list_empty(&rt->rt_uncached)) {
1465                 struct uncached_list *ul = rt->rt_uncached_list;
1466
1467                 spin_lock_bh(&ul->lock);
1468                 list_del(&rt->rt_uncached);
1469                 spin_unlock_bh(&ul->lock);
1470         }
1471 }
1472
1473 void rt_flush_dev(struct net_device *dev)
1474 {
1475         struct net *net = dev_net(dev);
1476         struct rtable *rt;
1477         int cpu;
1478
1479         for_each_possible_cpu(cpu) {
1480                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482                 spin_lock_bh(&ul->lock);
1483                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1484                         if (rt->dst.dev != dev)
1485                                 continue;
1486                         rt->dst.dev = net->loopback_dev;
1487                         dev_hold(rt->dst.dev);
1488                         dev_put(dev);
1489                 }
1490                 spin_unlock_bh(&ul->lock);
1491         }
1492 }
1493
1494 static bool rt_cache_valid(const struct rtable *rt)
1495 {
1496         return  rt &&
1497                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498                 !rt_is_expired(rt);
1499 }
1500
1501 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502                            const struct fib_result *res,
1503                            struct fib_nh_exception *fnhe,
1504                            struct fib_info *fi, u16 type, u32 itag,
1505                            const bool do_cache)
1506 {
1507         bool cached = false;
1508
1509         if (fi) {
1510                 struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513                         rt->rt_gateway = nh->nh_gw;
1514                         rt->rt_uses_gateway = 1;
1515                 }
1516                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517                 if (fi->fib_metrics != &dst_default_metrics) {
1518                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519                         refcount_inc(&fi->fib_metrics->refcnt);
1520                 }
1521 #ifdef CONFIG_IP_ROUTE_CLASSID
1522                 rt->dst.tclassid = nh->nh_tclassid;
1523 #endif
1524                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525                 if (unlikely(fnhe))
1526                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527                 else if (do_cache)
1528                         cached = rt_cache_route(nh, rt);
1529                 if (unlikely(!cached)) {
1530                         /* Routes we intend to cache in nexthop exception or
1531                          * FIB nexthop have the DST_NOCACHE bit clear.
1532                          * However, if we are unsuccessful at storing this
1533                          * route into the cache we really need to set it.
1534                          */
1535                         if (!rt->rt_gateway)
1536                                 rt->rt_gateway = daddr;
1537                         rt_add_uncached_list(rt);
1538                 }
1539         } else
1540                 rt_add_uncached_list(rt);
1541
1542 #ifdef CONFIG_IP_ROUTE_CLASSID
1543 #ifdef CONFIG_IP_MULTIPLE_TABLES
1544         set_class_tag(rt, res->tclassid);
1545 #endif
1546         set_class_tag(rt, itag);
1547 #endif
1548 }
1549
1550 struct rtable *rt_dst_alloc(struct net_device *dev,
1551                             unsigned int flags, u16 type,
1552                             bool nopolicy, bool noxfrm, bool will_cache)
1553 {
1554         struct rtable *rt;
1555
1556         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557                        (will_cache ? 0 : DST_HOST) |
1558                        (nopolicy ? DST_NOPOLICY : 0) |
1559                        (noxfrm ? DST_NOXFRM : 0));
1560
1561         if (rt) {
1562                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563                 rt->rt_flags = flags;
1564                 rt->rt_type = type;
1565                 rt->rt_is_input = 0;
1566                 rt->rt_iif = 0;
1567                 rt->rt_pmtu = 0;
1568                 rt->rt_mtu_locked = 0;
1569                 rt->rt_gateway = 0;
1570                 rt->rt_uses_gateway = 0;
1571                 rt->rt_table_id = 0;
1572                 INIT_LIST_HEAD(&rt->rt_uncached);
1573
1574                 rt->dst.output = ip_output;
1575                 if (flags & RTCF_LOCAL)
1576                         rt->dst.input = ip_local_deliver;
1577         }
1578
1579         return rt;
1580 }
1581 EXPORT_SYMBOL(rt_dst_alloc);
1582
1583 /* called in rcu_read_lock() section */
1584 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585                           u8 tos, struct net_device *dev,
1586                           struct in_device *in_dev, u32 *itag)
1587 {
1588         int err;
1589
1590         /* Primary sanity checks. */
1591         if (!in_dev)
1592                 return -EINVAL;
1593
1594         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1595             skb->protocol != htons(ETH_P_IP))
1596                 return -EINVAL;
1597
1598         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1599                 return -EINVAL;
1600
1601         if (ipv4_is_zeronet(saddr)) {
1602                 if (!ipv4_is_local_multicast(daddr))
1603                         return -EINVAL;
1604         } else {
1605                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1606                                           in_dev, itag);
1607                 if (err < 0)
1608                         return err;
1609         }
1610         return 0;
1611 }
1612
1613 /* called in rcu_read_lock() section */
1614 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1615                              u8 tos, struct net_device *dev, int our)
1616 {
1617         struct in_device *in_dev = __in_dev_get_rcu(dev);
1618         unsigned int flags = RTCF_MULTICAST;
1619         struct rtable *rth;
1620         u32 itag = 0;
1621         int err;
1622
1623         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1624         if (err)
1625                 return err;
1626
1627         if (our)
1628                 flags |= RTCF_LOCAL;
1629
1630         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1631                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1632         if (!rth)
1633                 return -ENOBUFS;
1634
1635 #ifdef CONFIG_IP_ROUTE_CLASSID
1636         rth->dst.tclassid = itag;
1637 #endif
1638         rth->dst.output = ip_rt_bug;
1639         rth->rt_is_input= 1;
1640
1641 #ifdef CONFIG_IP_MROUTE
1642         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1643                 rth->dst.input = ip_mr_input;
1644 #endif
1645         RT_CACHE_STAT_INC(in_slow_mc);
1646
1647         skb_dst_set(skb, &rth->dst);
1648         return 0;
1649 }
1650
1651
1652 static void ip_handle_martian_source(struct net_device *dev,
1653                                      struct in_device *in_dev,
1654                                      struct sk_buff *skb,
1655                                      __be32 daddr,
1656                                      __be32 saddr)
1657 {
1658         RT_CACHE_STAT_INC(in_martian_src);
1659 #ifdef CONFIG_IP_ROUTE_VERBOSE
1660         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1661                 /*
1662                  *      RFC1812 recommendation, if source is martian,
1663                  *      the only hint is MAC header.
1664                  */
1665                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1666                         &daddr, &saddr, dev->name);
1667                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1668                         print_hex_dump(KERN_WARNING, "ll header: ",
1669                                        DUMP_PREFIX_OFFSET, 16, 1,
1670                                        skb_mac_header(skb),
1671                                        dev->hard_header_len, true);
1672                 }
1673         }
1674 #endif
1675 }
1676
1677 static void set_lwt_redirect(struct rtable *rth)
1678 {
1679         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1680                 rth->dst.lwtstate->orig_output = rth->dst.output;
1681                 rth->dst.output = lwtunnel_output;
1682         }
1683
1684         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1685                 rth->dst.lwtstate->orig_input = rth->dst.input;
1686                 rth->dst.input = lwtunnel_input;
1687         }
1688 }
1689
1690 /* called in rcu_read_lock() section */
1691 static int __mkroute_input(struct sk_buff *skb,
1692                            const struct fib_result *res,
1693                            struct in_device *in_dev,
1694                            __be32 daddr, __be32 saddr, u32 tos)
1695 {
1696         struct fib_nh_exception *fnhe;
1697         struct rtable *rth;
1698         int err;
1699         struct in_device *out_dev;
1700         bool do_cache;
1701         u32 itag = 0;
1702
1703         /* get a working reference to the output device */
1704         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1705         if (!out_dev) {
1706                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1707                 return -EINVAL;
1708         }
1709
1710         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1711                                   in_dev->dev, in_dev, &itag);
1712         if (err < 0) {
1713                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1714                                          saddr);
1715
1716                 goto cleanup;
1717         }
1718
1719         do_cache = res->fi && !itag;
1720         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1721             skb->protocol == htons(ETH_P_IP) &&
1722             (IN_DEV_SHARED_MEDIA(out_dev) ||
1723              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1724                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1725
1726         if (skb->protocol != htons(ETH_P_IP)) {
1727                 /* Not IP (i.e. ARP). Do not create route, if it is
1728                  * invalid for proxy arp. DNAT routes are always valid.
1729                  *
1730                  * Proxy arp feature have been extended to allow, ARP
1731                  * replies back to the same interface, to support
1732                  * Private VLAN switch technologies. See arp.c.
1733                  */
1734                 if (out_dev == in_dev &&
1735                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1736                         err = -EINVAL;
1737                         goto cleanup;
1738                 }
1739         }
1740
1741         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1742         if (do_cache) {
1743                 if (fnhe)
1744                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1745                 else
1746                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1747                 if (rt_cache_valid(rth)) {
1748                         skb_dst_set_noref(skb, &rth->dst);
1749                         goto out;
1750                 }
1751         }
1752
1753         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1754                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1755                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1756         if (!rth) {
1757                 err = -ENOBUFS;
1758                 goto cleanup;
1759         }
1760
1761         rth->rt_is_input = 1;
1762         if (res->table)
1763                 rth->rt_table_id = res->table->tb_id;
1764         RT_CACHE_STAT_INC(in_slow_tot);
1765
1766         rth->dst.input = ip_forward;
1767
1768         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1769                        do_cache);
1770         set_lwt_redirect(rth);
1771         skb_dst_set(skb, &rth->dst);
1772 out:
1773         err = 0;
1774  cleanup:
1775         return err;
1776 }
1777
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1779 /* To make ICMP packets follow the right flow, the multipath hash is
1780  * calculated from the inner IP addresses.
1781  */
1782 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1783                                  struct flow_keys *hash_keys)
1784 {
1785         const struct iphdr *outer_iph = ip_hdr(skb);
1786         const struct iphdr *inner_iph;
1787         const struct icmphdr *icmph;
1788         struct iphdr _inner_iph;
1789         struct icmphdr _icmph;
1790
1791         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1792         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1793         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1794                 return;
1795
1796         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1797                 return;
1798
1799         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1800                                    &_icmph);
1801         if (!icmph)
1802                 return;
1803
1804         if (icmph->type != ICMP_DEST_UNREACH &&
1805             icmph->type != ICMP_REDIRECT &&
1806             icmph->type != ICMP_TIME_EXCEEDED &&
1807             icmph->type != ICMP_PARAMETERPROB)
1808                 return;
1809
1810         inner_iph = skb_header_pointer(skb,
1811                                        outer_iph->ihl * 4 + sizeof(_icmph),
1812                                        sizeof(_inner_iph), &_inner_iph);
1813         if (!inner_iph)
1814                 return;
1815         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1816         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1817 }
1818
1819 /* if skb is set it will be used and fl4 can be NULL */
1820 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1821                        const struct sk_buff *skb)
1822 {
1823         struct net *net = fi->fib_net;
1824         struct flow_keys hash_keys;
1825         u32 mhash;
1826
1827         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1828         case 0:
1829                 memset(&hash_keys, 0, sizeof(hash_keys));
1830                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1831                 if (skb) {
1832                         ip_multipath_l3_keys(skb, &hash_keys);
1833                 } else {
1834                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1835                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1836                 }
1837                 break;
1838         case 1:
1839                 /* skb is currently provided only when forwarding */
1840                 if (skb) {
1841                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1842                         struct flow_keys keys;
1843
1844                         /* short-circuit if we already have L4 hash present */
1845                         if (skb->l4_hash)
1846                                 return skb_get_hash_raw(skb) >> 1;
1847                         memset(&hash_keys, 0, sizeof(hash_keys));
1848                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1849
1850                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1851                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1852                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1853                         hash_keys.ports.src = keys.ports.src;
1854                         hash_keys.ports.dst = keys.ports.dst;
1855                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1856                 } else {
1857                         memset(&hash_keys, 0, sizeof(hash_keys));
1858                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1860                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1861                         hash_keys.ports.src = fl4->fl4_sport;
1862                         hash_keys.ports.dst = fl4->fl4_dport;
1863                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1864                 }
1865                 break;
1866         }
1867         mhash = flow_hash_from_keys(&hash_keys);
1868
1869         return mhash >> 1;
1870 }
1871 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1872 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1873
1874 static int ip_mkroute_input(struct sk_buff *skb,
1875                             struct fib_result *res,
1876                             struct in_device *in_dev,
1877                             __be32 daddr, __be32 saddr, u32 tos)
1878 {
1879 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1880         if (res->fi && res->fi->fib_nhs > 1) {
1881                 int h = fib_multipath_hash(res->fi, NULL, skb);
1882
1883                 fib_select_multipath(res, h);
1884         }
1885 #endif
1886
1887         /* create a routing cache entry */
1888         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1889 }
1890
1891 /*
1892  *      NOTE. We drop all the packets that has local source
1893  *      addresses, because every properly looped back packet
1894  *      must have correct destination already attached by output routine.
1895  *
1896  *      Such approach solves two big problems:
1897  *      1. Not simplex devices are handled properly.
1898  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1899  *      called with rcu_read_lock()
1900  */
1901
1902 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1903                                u8 tos, struct net_device *dev,
1904                                struct fib_result *res)
1905 {
1906         struct in_device *in_dev = __in_dev_get_rcu(dev);
1907         struct ip_tunnel_info *tun_info;
1908         struct flowi4   fl4;
1909         unsigned int    flags = 0;
1910         u32             itag = 0;
1911         struct rtable   *rth;
1912         int             err = -EINVAL;
1913         struct net    *net = dev_net(dev);
1914         bool do_cache;
1915
1916         /* IP on this device is disabled. */
1917
1918         if (!in_dev)
1919                 goto out;
1920
1921         /* Check for the most weird martians, which can be not detected
1922            by fib_lookup.
1923          */
1924
1925         tun_info = skb_tunnel_info(skb);
1926         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1927                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1928         else
1929                 fl4.flowi4_tun_key.tun_id = 0;
1930         skb_dst_drop(skb);
1931
1932         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1933                 goto martian_source;
1934
1935         res->fi = NULL;
1936         res->table = NULL;
1937         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1938                 goto brd_input;
1939
1940         /* Accept zero addresses only to limited broadcast;
1941          * I even do not know to fix it or not. Waiting for complains :-)
1942          */
1943         if (ipv4_is_zeronet(saddr))
1944                 goto martian_source;
1945
1946         if (ipv4_is_zeronet(daddr))
1947                 goto martian_destination;
1948
1949         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1950          * and call it once if daddr or/and saddr are loopback addresses
1951          */
1952         if (ipv4_is_loopback(daddr)) {
1953                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1954                         goto martian_destination;
1955         } else if (ipv4_is_loopback(saddr)) {
1956                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1957                         goto martian_source;
1958         }
1959
1960         /*
1961          *      Now we are ready to route packet.
1962          */
1963         fl4.flowi4_oif = 0;
1964         fl4.flowi4_iif = dev->ifindex;
1965         fl4.flowi4_mark = skb->mark;
1966         fl4.flowi4_tos = tos;
1967         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1968         fl4.flowi4_flags = 0;
1969         fl4.daddr = daddr;
1970         fl4.saddr = saddr;
1971         fl4.flowi4_uid = sock_net_uid(net, NULL);
1972         err = fib_lookup(net, &fl4, res, 0);
1973         if (err != 0) {
1974                 if (!IN_DEV_FORWARD(in_dev))
1975                         err = -EHOSTUNREACH;
1976                 goto no_route;
1977         }
1978
1979         if (res->type == RTN_BROADCAST)
1980                 goto brd_input;
1981
1982         if (res->type == RTN_LOCAL) {
1983                 err = fib_validate_source(skb, saddr, daddr, tos,
1984                                           0, dev, in_dev, &itag);
1985                 if (err < 0)
1986                         goto martian_source;
1987                 goto local_input;
1988         }
1989
1990         if (!IN_DEV_FORWARD(in_dev)) {
1991                 err = -EHOSTUNREACH;
1992                 goto no_route;
1993         }
1994         if (res->type != RTN_UNICAST)
1995                 goto martian_destination;
1996
1997         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1998 out:    return err;
1999
2000 brd_input:
2001         if (skb->protocol != htons(ETH_P_IP))
2002                 goto e_inval;
2003
2004         if (!ipv4_is_zeronet(saddr)) {
2005                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006                                           in_dev, &itag);
2007                 if (err < 0)
2008                         goto martian_source;
2009         }
2010         flags |= RTCF_BROADCAST;
2011         res->type = RTN_BROADCAST;
2012         RT_CACHE_STAT_INC(in_brd);
2013
2014 local_input:
2015         do_cache = false;
2016         if (res->fi) {
2017                 if (!itag) {
2018                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019                         if (rt_cache_valid(rth)) {
2020                                 skb_dst_set_noref(skb, &rth->dst);
2021                                 err = 0;
2022                                 goto out;
2023                         }
2024                         do_cache = true;
2025                 }
2026         }
2027
2028         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029                            flags | RTCF_LOCAL, res->type,
2030                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031         if (!rth)
2032                 goto e_nobufs;
2033
2034         rth->dst.output= ip_rt_bug;
2035 #ifdef CONFIG_IP_ROUTE_CLASSID
2036         rth->dst.tclassid = itag;
2037 #endif
2038         rth->rt_is_input = 1;
2039         if (res->table)
2040                 rth->rt_table_id = res->table->tb_id;
2041
2042         RT_CACHE_STAT_INC(in_slow_tot);
2043         if (res->type == RTN_UNREACHABLE) {
2044                 rth->dst.input= ip_error;
2045                 rth->dst.error= -err;
2046                 rth->rt_flags   &= ~RTCF_LOCAL;
2047         }
2048
2049         if (do_cache) {
2050                 struct fib_nh *nh = &FIB_RES_NH(*res);
2051
2052                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2053                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2054                         WARN_ON(rth->dst.input == lwtunnel_input);
2055                         rth->dst.lwtstate->orig_input = rth->dst.input;
2056                         rth->dst.input = lwtunnel_input;
2057                 }
2058
2059                 if (unlikely(!rt_cache_route(nh, rth)))
2060                         rt_add_uncached_list(rth);
2061         }
2062         skb_dst_set(skb, &rth->dst);
2063         err = 0;
2064         goto out;
2065
2066 no_route:
2067         RT_CACHE_STAT_INC(in_no_route);
2068         res->type = RTN_UNREACHABLE;
2069         res->fi = NULL;
2070         res->table = NULL;
2071         goto local_input;
2072
2073         /*
2074          *      Do not cache martian addresses: they should be logged (RFC1812)
2075          */
2076 martian_destination:
2077         RT_CACHE_STAT_INC(in_martian_dst);
2078 #ifdef CONFIG_IP_ROUTE_VERBOSE
2079         if (IN_DEV_LOG_MARTIANS(in_dev))
2080                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2081                                      &daddr, &saddr, dev->name);
2082 #endif
2083
2084 e_inval:
2085         err = -EINVAL;
2086         goto out;
2087
2088 e_nobufs:
2089         err = -ENOBUFS;
2090         goto out;
2091
2092 martian_source:
2093         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2094         goto out;
2095 }
2096
2097 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098                          u8 tos, struct net_device *dev)
2099 {
2100         struct fib_result res;
2101         int err;
2102
2103         tos &= IPTOS_RT_MASK;
2104         rcu_read_lock();
2105         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2106         rcu_read_unlock();
2107
2108         return err;
2109 }
2110 EXPORT_SYMBOL(ip_route_input_noref);
2111
2112 /* called with rcu_read_lock held */
2113 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2114                        u8 tos, struct net_device *dev, struct fib_result *res)
2115 {
2116         /* Multicast recognition logic is moved from route cache to here.
2117            The problem was that too many Ethernet cards have broken/missing
2118            hardware multicast filters :-( As result the host on multicasting
2119            network acquires a lot of useless route cache entries, sort of
2120            SDR messages from all the world. Now we try to get rid of them.
2121            Really, provided software IP multicast filter is organized
2122            reasonably (at least, hashed), it does not result in a slowdown
2123            comparing with route cache reject entries.
2124            Note, that multicast routers are not affected, because
2125            route cache entry is created eventually.
2126          */
2127         if (ipv4_is_multicast(daddr)) {
2128                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2129                 int our = 0;
2130                 int err = -EINVAL;
2131
2132                 if (!in_dev)
2133                         return err;
2134                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2135                                       ip_hdr(skb)->protocol);
2136
2137                 /* check l3 master if no match yet */
2138                 if (!our && netif_is_l3_slave(dev)) {
2139                         struct in_device *l3_in_dev;
2140
2141                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2142                         if (l3_in_dev)
2143                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2144                                                       ip_hdr(skb)->protocol);
2145                 }
2146
2147                 if (our
2148 #ifdef CONFIG_IP_MROUTE
2149                         ||
2150                     (!ipv4_is_local_multicast(daddr) &&
2151                      IN_DEV_MFORWARD(in_dev))
2152 #endif
2153                    ) {
2154                         err = ip_route_input_mc(skb, daddr, saddr,
2155                                                 tos, dev, our);
2156                 }
2157                 return err;
2158         }
2159
2160         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2161 }
2162
2163 /* called with rcu_read_lock() */
2164 static struct rtable *__mkroute_output(const struct fib_result *res,
2165                                        const struct flowi4 *fl4, int orig_oif,
2166                                        struct net_device *dev_out,
2167                                        unsigned int flags)
2168 {
2169         struct fib_info *fi = res->fi;
2170         struct fib_nh_exception *fnhe;
2171         struct in_device *in_dev;
2172         u16 type = res->type;
2173         struct rtable *rth;
2174         bool do_cache;
2175
2176         in_dev = __in_dev_get_rcu(dev_out);
2177         if (!in_dev)
2178                 return ERR_PTR(-EINVAL);
2179
2180         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2181                 if (ipv4_is_loopback(fl4->saddr) &&
2182                     !(dev_out->flags & IFF_LOOPBACK) &&
2183                     !netif_is_l3_master(dev_out))
2184                         return ERR_PTR(-EINVAL);
2185
2186         if (ipv4_is_lbcast(fl4->daddr))
2187                 type = RTN_BROADCAST;
2188         else if (ipv4_is_multicast(fl4->daddr))
2189                 type = RTN_MULTICAST;
2190         else if (ipv4_is_zeronet(fl4->daddr))
2191                 return ERR_PTR(-EINVAL);
2192
2193         if (dev_out->flags & IFF_LOOPBACK)
2194                 flags |= RTCF_LOCAL;
2195
2196         do_cache = true;
2197         if (type == RTN_BROADCAST) {
2198                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2199                 fi = NULL;
2200         } else if (type == RTN_MULTICAST) {
2201                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2202                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2203                                      fl4->flowi4_proto))
2204                         flags &= ~RTCF_LOCAL;
2205                 else
2206                         do_cache = false;
2207                 /* If multicast route do not exist use
2208                  * default one, but do not gateway in this case.
2209                  * Yes, it is hack.
2210                  */
2211                 if (fi && res->prefixlen < 4)
2212                         fi = NULL;
2213         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2214                    (orig_oif != dev_out->ifindex)) {
2215                 /* For local routes that require a particular output interface
2216                  * we do not want to cache the result.  Caching the result
2217                  * causes incorrect behaviour when there are multiple source
2218                  * addresses on the interface, the end result being that if the
2219                  * intended recipient is waiting on that interface for the
2220                  * packet he won't receive it because it will be delivered on
2221                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2222                  * be set to the loopback interface as well.
2223                  */
2224                 do_cache = false;
2225         }
2226
2227         fnhe = NULL;
2228         do_cache &= fi != NULL;
2229         if (fi) {
2230                 struct rtable __rcu **prth;
2231                 struct fib_nh *nh = &FIB_RES_NH(*res);
2232
2233                 fnhe = find_exception(nh, fl4->daddr);
2234                 if (!do_cache)
2235                         goto add;
2236                 if (fnhe) {
2237                         prth = &fnhe->fnhe_rth_output;
2238                 } else {
2239                         if (unlikely(fl4->flowi4_flags &
2240                                      FLOWI_FLAG_KNOWN_NH &&
2241                                      !(nh->nh_gw &&
2242                                        nh->nh_scope == RT_SCOPE_LINK))) {
2243                                 do_cache = false;
2244                                 goto add;
2245                         }
2246                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2247                 }
2248                 rth = rcu_dereference(*prth);
2249                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2250                         return rth;
2251         }
2252
2253 add:
2254         rth = rt_dst_alloc(dev_out, flags, type,
2255                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2256                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2257                            do_cache);
2258         if (!rth)
2259                 return ERR_PTR(-ENOBUFS);
2260
2261         rth->rt_iif = orig_oif;
2262         if (res->table)
2263                 rth->rt_table_id = res->table->tb_id;
2264
2265         RT_CACHE_STAT_INC(out_slow_tot);
2266
2267         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2268                 if (flags & RTCF_LOCAL &&
2269                     !(dev_out->flags & IFF_LOOPBACK)) {
2270                         rth->dst.output = ip_mc_output;
2271                         RT_CACHE_STAT_INC(out_slow_mc);
2272                 }
2273 #ifdef CONFIG_IP_MROUTE
2274                 if (type == RTN_MULTICAST) {
2275                         if (IN_DEV_MFORWARD(in_dev) &&
2276                             !ipv4_is_local_multicast(fl4->daddr)) {
2277                                 rth->dst.input = ip_mr_input;
2278                                 rth->dst.output = ip_mc_output;
2279                         }
2280                 }
2281 #endif
2282         }
2283
2284         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2285         set_lwt_redirect(rth);
2286
2287         return rth;
2288 }
2289
2290 /*
2291  * Major route resolver routine.
2292  */
2293
2294 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2295                                         const struct sk_buff *skb)
2296 {
2297         __u8 tos = RT_FL_TOS(fl4);
2298         struct fib_result res = {
2299                 .type           = RTN_UNSPEC,
2300                 .fi             = NULL,
2301                 .table          = NULL,
2302                 .tclassid       = 0,
2303         };
2304         struct rtable *rth;
2305
2306         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2307         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2308         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2309                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2310
2311         rcu_read_lock();
2312         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2313         rcu_read_unlock();
2314
2315         return rth;
2316 }
2317 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2318
2319 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2320                                             struct fib_result *res,
2321                                             const struct sk_buff *skb)
2322 {
2323         struct net_device *dev_out = NULL;
2324         int orig_oif = fl4->flowi4_oif;
2325         unsigned int flags = 0;
2326         struct rtable *rth;
2327         int err = -ENETUNREACH;
2328
2329         if (fl4->saddr) {
2330                 rth = ERR_PTR(-EINVAL);
2331                 if (ipv4_is_multicast(fl4->saddr) ||
2332                     ipv4_is_lbcast(fl4->saddr) ||
2333                     ipv4_is_zeronet(fl4->saddr))
2334                         goto out;
2335
2336                 /* I removed check for oif == dev_out->oif here.
2337                    It was wrong for two reasons:
2338                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2339                       is assigned to multiple interfaces.
2340                    2. Moreover, we are allowed to send packets with saddr
2341                       of another iface. --ANK
2342                  */
2343
2344                 if (fl4->flowi4_oif == 0 &&
2345                     (ipv4_is_multicast(fl4->daddr) ||
2346                      ipv4_is_lbcast(fl4->daddr))) {
2347                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2348                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2349                         if (!dev_out)
2350                                 goto out;
2351
2352                         /* Special hack: user can direct multicasts
2353                            and limited broadcast via necessary interface
2354                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2355                            This hack is not just for fun, it allows
2356                            vic,vat and friends to work.
2357                            They bind socket to loopback, set ttl to zero
2358                            and expect that it will work.
2359                            From the viewpoint of routing cache they are broken,
2360                            because we are not allowed to build multicast path
2361                            with loopback source addr (look, routing cache
2362                            cannot know, that ttl is zero, so that packet
2363                            will not leave this host and route is valid).
2364                            Luckily, this hack is good workaround.
2365                          */
2366
2367                         fl4->flowi4_oif = dev_out->ifindex;
2368                         goto make_route;
2369                 }
2370
2371                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2372                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2373                         if (!__ip_dev_find(net, fl4->saddr, false))
2374                                 goto out;
2375                 }
2376         }
2377
2378
2379         if (fl4->flowi4_oif) {
2380                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2381                 rth = ERR_PTR(-ENODEV);
2382                 if (!dev_out)
2383                         goto out;
2384
2385                 /* RACE: Check return value of inet_select_addr instead. */
2386                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2387                         rth = ERR_PTR(-ENETUNREACH);
2388                         goto out;
2389                 }
2390                 if (ipv4_is_local_multicast(fl4->daddr) ||
2391                     ipv4_is_lbcast(fl4->daddr) ||
2392                     fl4->flowi4_proto == IPPROTO_IGMP) {
2393                         if (!fl4->saddr)
2394                                 fl4->saddr = inet_select_addr(dev_out, 0,
2395                                                               RT_SCOPE_LINK);
2396                         goto make_route;
2397                 }
2398                 if (!fl4->saddr) {
2399                         if (ipv4_is_multicast(fl4->daddr))
2400                                 fl4->saddr = inet_select_addr(dev_out, 0,
2401                                                               fl4->flowi4_scope);
2402                         else if (!fl4->daddr)
2403                                 fl4->saddr = inet_select_addr(dev_out, 0,
2404                                                               RT_SCOPE_HOST);
2405                 }
2406         }
2407
2408         if (!fl4->daddr) {
2409                 fl4->daddr = fl4->saddr;
2410                 if (!fl4->daddr)
2411                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2412                 dev_out = net->loopback_dev;
2413                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2414                 res->type = RTN_LOCAL;
2415                 flags |= RTCF_LOCAL;
2416                 goto make_route;
2417         }
2418
2419         err = fib_lookup(net, fl4, res, 0);
2420         if (err) {
2421                 res->fi = NULL;
2422                 res->table = NULL;
2423                 if (fl4->flowi4_oif &&
2424                     (ipv4_is_multicast(fl4->daddr) ||
2425                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2426                         /* Apparently, routing tables are wrong. Assume,
2427                            that the destination is on link.
2428
2429                            WHY? DW.
2430                            Because we are allowed to send to iface
2431                            even if it has NO routes and NO assigned
2432                            addresses. When oif is specified, routing
2433                            tables are looked up with only one purpose:
2434                            to catch if destination is gatewayed, rather than
2435                            direct. Moreover, if MSG_DONTROUTE is set,
2436                            we send packet, ignoring both routing tables
2437                            and ifaddr state. --ANK
2438
2439
2440                            We could make it even if oif is unknown,
2441                            likely IPv6, but we do not.
2442                          */
2443
2444                         if (fl4->saddr == 0)
2445                                 fl4->saddr = inet_select_addr(dev_out, 0,
2446                                                               RT_SCOPE_LINK);
2447                         res->type = RTN_UNICAST;
2448                         goto make_route;
2449                 }
2450                 rth = ERR_PTR(err);
2451                 goto out;
2452         }
2453
2454         if (res->type == RTN_LOCAL) {
2455                 if (!fl4->saddr) {
2456                         if (res->fi->fib_prefsrc)
2457                                 fl4->saddr = res->fi->fib_prefsrc;
2458                         else
2459                                 fl4->saddr = fl4->daddr;
2460                 }
2461
2462                 /* L3 master device is the loopback for that domain */
2463                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2464                         net->loopback_dev;
2465
2466                 /* make sure orig_oif points to fib result device even
2467                  * though packet rx/tx happens over loopback or l3mdev
2468                  */
2469                 orig_oif = FIB_RES_OIF(*res);
2470
2471                 fl4->flowi4_oif = dev_out->ifindex;
2472                 flags |= RTCF_LOCAL;
2473                 goto make_route;
2474         }
2475
2476         fib_select_path(net, res, fl4, skb);
2477
2478         dev_out = FIB_RES_DEV(*res);
2479         fl4->flowi4_oif = dev_out->ifindex;
2480
2481
2482 make_route:
2483         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2484
2485 out:
2486         return rth;
2487 }
2488
2489 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2490 {
2491         return NULL;
2492 }
2493
2494 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2495 {
2496         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2497
2498         return mtu ? : dst->dev->mtu;
2499 }
2500
2501 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2502                                           struct sk_buff *skb, u32 mtu)
2503 {
2504 }
2505
2506 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2507                                        struct sk_buff *skb)
2508 {
2509 }
2510
2511 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2512                                           unsigned long old)
2513 {
2514         return NULL;
2515 }
2516
2517 static struct dst_ops ipv4_dst_blackhole_ops = {
2518         .family                 =       AF_INET,
2519         .check                  =       ipv4_blackhole_dst_check,
2520         .mtu                    =       ipv4_blackhole_mtu,
2521         .default_advmss         =       ipv4_default_advmss,
2522         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2523         .redirect               =       ipv4_rt_blackhole_redirect,
2524         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2525         .neigh_lookup           =       ipv4_neigh_lookup,
2526 };
2527
2528 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2529 {
2530         struct rtable *ort = (struct rtable *) dst_orig;
2531         struct rtable *rt;
2532
2533         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2534         if (rt) {
2535                 struct dst_entry *new = &rt->dst;
2536
2537                 new->__use = 1;
2538                 new->input = dst_discard;
2539                 new->output = dst_discard_out;
2540
2541                 new->dev = net->loopback_dev;
2542                 if (new->dev)
2543                         dev_hold(new->dev);
2544
2545                 rt->rt_is_input = ort->rt_is_input;
2546                 rt->rt_iif = ort->rt_iif;
2547                 rt->rt_pmtu = ort->rt_pmtu;
2548                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2549
2550                 rt->rt_genid = rt_genid_ipv4(net);
2551                 rt->rt_flags = ort->rt_flags;
2552                 rt->rt_type = ort->rt_type;
2553                 rt->rt_gateway = ort->rt_gateway;
2554                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2555
2556                 INIT_LIST_HEAD(&rt->rt_uncached);
2557         }
2558
2559         dst_release(dst_orig);
2560
2561         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2562 }
2563
2564 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2565                                     const struct sock *sk)
2566 {
2567         struct rtable *rt = __ip_route_output_key(net, flp4);
2568
2569         if (IS_ERR(rt))
2570                 return rt;
2571
2572         if (flp4->flowi4_proto)
2573                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2574                                                         flowi4_to_flowi(flp4),
2575                                                         sk, 0);
2576
2577         return rt;
2578 }
2579 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2580
2581 /* called with rcu_read_lock held */
2582 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2583                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2584                         u32 seq)
2585 {
2586         struct rtable *rt = skb_rtable(skb);
2587         struct rtmsg *r;
2588         struct nlmsghdr *nlh;
2589         unsigned long expires = 0;
2590         u32 error;
2591         u32 metrics[RTAX_MAX];
2592
2593         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2594         if (!nlh)
2595                 return -EMSGSIZE;
2596
2597         r = nlmsg_data(nlh);
2598         r->rtm_family    = AF_INET;
2599         r->rtm_dst_len  = 32;
2600         r->rtm_src_len  = 0;
2601         r->rtm_tos      = fl4->flowi4_tos;
2602         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2603         if (nla_put_u32(skb, RTA_TABLE, table_id))
2604                 goto nla_put_failure;
2605         r->rtm_type     = rt->rt_type;
2606         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2607         r->rtm_protocol = RTPROT_UNSPEC;
2608         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2609         if (rt->rt_flags & RTCF_NOTIFY)
2610                 r->rtm_flags |= RTM_F_NOTIFY;
2611         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2612                 r->rtm_flags |= RTCF_DOREDIRECT;
2613
2614         if (nla_put_in_addr(skb, RTA_DST, dst))
2615                 goto nla_put_failure;
2616         if (src) {
2617                 r->rtm_src_len = 32;
2618                 if (nla_put_in_addr(skb, RTA_SRC, src))
2619                         goto nla_put_failure;
2620         }
2621         if (rt->dst.dev &&
2622             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2623                 goto nla_put_failure;
2624 #ifdef CONFIG_IP_ROUTE_CLASSID
2625         if (rt->dst.tclassid &&
2626             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2627                 goto nla_put_failure;
2628 #endif
2629         if (!rt_is_input_route(rt) &&
2630             fl4->saddr != src) {
2631                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2632                         goto nla_put_failure;
2633         }
2634         if (rt->rt_uses_gateway &&
2635             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2636                 goto nla_put_failure;
2637
2638         expires = rt->dst.expires;
2639         if (expires) {
2640                 unsigned long now = jiffies;
2641
2642                 if (time_before(now, expires))
2643                         expires -= now;
2644                 else
2645                         expires = 0;
2646         }
2647
2648         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2649         if (rt->rt_pmtu && expires)
2650                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2651         if (rt->rt_mtu_locked && expires)
2652                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2653         if (rtnetlink_put_metrics(skb, metrics) < 0)
2654                 goto nla_put_failure;
2655
2656         if (fl4->flowi4_mark &&
2657             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2658                 goto nla_put_failure;
2659
2660         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2661             nla_put_u32(skb, RTA_UID,
2662                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2663                 goto nla_put_failure;
2664
2665         error = rt->dst.error;
2666
2667         if (rt_is_input_route(rt)) {
2668 #ifdef CONFIG_IP_MROUTE
2669                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2670                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2671                         int err = ipmr_get_route(net, skb,
2672                                                  fl4->saddr, fl4->daddr,
2673                                                  r, portid);
2674
2675                         if (err <= 0) {
2676                                 if (err == 0)
2677                                         return 0;
2678                                 goto nla_put_failure;
2679                         }
2680                 } else
2681 #endif
2682                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2683                                 goto nla_put_failure;
2684         }
2685
2686         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2687                 goto nla_put_failure;
2688
2689         nlmsg_end(skb, nlh);
2690         return 0;
2691
2692 nla_put_failure:
2693         nlmsg_cancel(skb, nlh);
2694         return -EMSGSIZE;
2695 }
2696
2697 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2698                              struct netlink_ext_ack *extack)
2699 {
2700         struct net *net = sock_net(in_skb->sk);
2701         struct rtmsg *rtm;
2702         struct nlattr *tb[RTA_MAX+1];
2703         struct fib_result res = {};
2704         struct rtable *rt = NULL;
2705         struct flowi4 fl4;
2706         __be32 dst = 0;
2707         __be32 src = 0;
2708         u32 iif;
2709         int err;
2710         int mark;
2711         struct sk_buff *skb;
2712         u32 table_id = RT_TABLE_MAIN;
2713         kuid_t uid;
2714
2715         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2716                           extack);
2717         if (err < 0)
2718                 goto errout;
2719
2720         rtm = nlmsg_data(nlh);
2721
2722         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2723         if (!skb) {
2724                 err = -ENOBUFS;
2725                 goto errout;
2726         }
2727
2728         /* Reserve room for dummy headers, this skb can pass
2729            through good chunk of routing engine.
2730          */
2731         skb_reset_mac_header(skb);
2732         skb_reset_network_header(skb);
2733
2734         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2735         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2736         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2737         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2738         if (tb[RTA_UID])
2739                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2740         else
2741                 uid = (iif ? INVALID_UID : current_uid());
2742
2743         /* Bugfix: need to give ip_route_input enough of an IP header to
2744          * not gag.
2745          */
2746         ip_hdr(skb)->protocol = IPPROTO_UDP;
2747         ip_hdr(skb)->saddr = src;
2748         ip_hdr(skb)->daddr = dst;
2749
2750         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2751
2752         memset(&fl4, 0, sizeof(fl4));
2753         fl4.daddr = dst;
2754         fl4.saddr = src;
2755         fl4.flowi4_tos = rtm->rtm_tos;
2756         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2757         fl4.flowi4_mark = mark;
2758         fl4.flowi4_uid = uid;
2759
2760         rcu_read_lock();
2761
2762         if (iif) {
2763                 struct net_device *dev;
2764
2765                 dev = dev_get_by_index_rcu(net, iif);
2766                 if (!dev) {
2767                         err = -ENODEV;
2768                         goto errout_free;
2769                 }
2770
2771                 skb->protocol   = htons(ETH_P_IP);
2772                 skb->dev        = dev;
2773                 skb->mark       = mark;
2774                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2775                                          dev, &res);
2776
2777                 rt = skb_rtable(skb);
2778                 if (err == 0 && rt->dst.error)
2779                         err = -rt->dst.error;
2780         } else {
2781                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2782                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2783                 err = 0;
2784                 if (IS_ERR(rt))
2785                         err = PTR_ERR(rt);
2786                 else
2787                         skb_dst_set(skb, &rt->dst);
2788         }
2789
2790         if (err)
2791                 goto errout_free;
2792
2793         if (rtm->rtm_flags & RTM_F_NOTIFY)
2794                 rt->rt_flags |= RTCF_NOTIFY;
2795
2796         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2797                 table_id = rt->rt_table_id;
2798
2799         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2800                 if (!res.fi) {
2801                         err = fib_props[res.type].error;
2802                         if (!err)
2803                                 err = -EHOSTUNREACH;
2804                         goto errout_free;
2805                 }
2806                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2807                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2808                                     rt->rt_type, res.prefix, res.prefixlen,
2809                                     fl4.flowi4_tos, res.fi, 0);
2810         } else {
2811                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2812                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2813         }
2814         if (err < 0)
2815                 goto errout_free;
2816
2817         rcu_read_unlock();
2818
2819         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2820 errout:
2821         return err;
2822
2823 errout_free:
2824         rcu_read_unlock();
2825         kfree_skb(skb);
2826         goto errout;
2827 }
2828
2829 void ip_rt_multicast_event(struct in_device *in_dev)
2830 {
2831         rt_cache_flush(dev_net(in_dev->dev));
2832 }
2833
2834 #ifdef CONFIG_SYSCTL
2835 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2836 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2837 static int ip_rt_gc_elasticity __read_mostly    = 8;
2838
2839 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2840                                         void __user *buffer,
2841                                         size_t *lenp, loff_t *ppos)
2842 {
2843         struct net *net = (struct net *)__ctl->extra1;
2844
2845         if (write) {
2846                 rt_cache_flush(net);
2847                 fnhe_genid_bump(net);
2848                 return 0;
2849         }
2850
2851         return -EINVAL;
2852 }
2853
2854 static struct ctl_table ipv4_route_table[] = {
2855         {
2856                 .procname       = "gc_thresh",
2857                 .data           = &ipv4_dst_ops.gc_thresh,
2858                 .maxlen         = sizeof(int),
2859                 .mode           = 0644,
2860                 .proc_handler   = proc_dointvec,
2861         },
2862         {
2863                 .procname       = "max_size",
2864                 .data           = &ip_rt_max_size,
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0644,
2867                 .proc_handler   = proc_dointvec,
2868         },
2869         {
2870                 /*  Deprecated. Use gc_min_interval_ms */
2871
2872                 .procname       = "gc_min_interval",
2873                 .data           = &ip_rt_gc_min_interval,
2874                 .maxlen         = sizeof(int),
2875                 .mode           = 0644,
2876                 .proc_handler   = proc_dointvec_jiffies,
2877         },
2878         {
2879                 .procname       = "gc_min_interval_ms",
2880                 .data           = &ip_rt_gc_min_interval,
2881                 .maxlen         = sizeof(int),
2882                 .mode           = 0644,
2883                 .proc_handler   = proc_dointvec_ms_jiffies,
2884         },
2885         {
2886                 .procname       = "gc_timeout",
2887                 .data           = &ip_rt_gc_timeout,
2888                 .maxlen         = sizeof(int),
2889                 .mode           = 0644,
2890                 .proc_handler   = proc_dointvec_jiffies,
2891         },
2892         {
2893                 .procname       = "gc_interval",
2894                 .data           = &ip_rt_gc_interval,
2895                 .maxlen         = sizeof(int),
2896                 .mode           = 0644,
2897                 .proc_handler   = proc_dointvec_jiffies,
2898         },
2899         {
2900                 .procname       = "redirect_load",
2901                 .data           = &ip_rt_redirect_load,
2902                 .maxlen         = sizeof(int),
2903                 .mode           = 0644,
2904                 .proc_handler   = proc_dointvec,
2905         },
2906         {
2907                 .procname       = "redirect_number",
2908                 .data           = &ip_rt_redirect_number,
2909                 .maxlen         = sizeof(int),
2910                 .mode           = 0644,
2911                 .proc_handler   = proc_dointvec,
2912         },
2913         {
2914                 .procname       = "redirect_silence",
2915                 .data           = &ip_rt_redirect_silence,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = proc_dointvec,
2919         },
2920         {
2921                 .procname       = "error_cost",
2922                 .data           = &ip_rt_error_cost,
2923                 .maxlen         = sizeof(int),
2924                 .mode           = 0644,
2925                 .proc_handler   = proc_dointvec,
2926         },
2927         {
2928                 .procname       = "error_burst",
2929                 .data           = &ip_rt_error_burst,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = proc_dointvec,
2933         },
2934         {
2935                 .procname       = "gc_elasticity",
2936                 .data           = &ip_rt_gc_elasticity,
2937                 .maxlen         = sizeof(int),
2938                 .mode           = 0644,
2939                 .proc_handler   = proc_dointvec,
2940         },
2941         {
2942                 .procname       = "mtu_expires",
2943                 .data           = &ip_rt_mtu_expires,
2944                 .maxlen         = sizeof(int),
2945                 .mode           = 0644,
2946                 .proc_handler   = proc_dointvec_jiffies,
2947         },
2948         {
2949                 .procname       = "min_pmtu",
2950                 .data           = &ip_rt_min_pmtu,
2951                 .maxlen         = sizeof(int),
2952                 .mode           = 0644,
2953                 .proc_handler   = proc_dointvec_minmax,
2954                 .extra1         = &ip_min_valid_pmtu,
2955         },
2956         {
2957                 .procname       = "min_adv_mss",
2958                 .data           = &ip_rt_min_advmss,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = proc_dointvec,
2962         },
2963         { }
2964 };
2965
2966 static struct ctl_table ipv4_route_flush_table[] = {
2967         {
2968                 .procname       = "flush",
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0200,
2971                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2972         },
2973         { },
2974 };
2975
2976 static __net_init int sysctl_route_net_init(struct net *net)
2977 {
2978         struct ctl_table *tbl;
2979
2980         tbl = ipv4_route_flush_table;
2981         if (!net_eq(net, &init_net)) {
2982                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2983                 if (!tbl)
2984                         goto err_dup;
2985
2986                 /* Don't export sysctls to unprivileged users */
2987                 if (net->user_ns != &init_user_ns)
2988                         tbl[0].procname = NULL;
2989         }
2990         tbl[0].extra1 = net;
2991
2992         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2993         if (!net->ipv4.route_hdr)
2994                 goto err_reg;
2995         return 0;
2996
2997 err_reg:
2998         if (tbl != ipv4_route_flush_table)
2999                 kfree(tbl);
3000 err_dup:
3001         return -ENOMEM;
3002 }
3003
3004 static __net_exit void sysctl_route_net_exit(struct net *net)
3005 {
3006         struct ctl_table *tbl;
3007
3008         tbl = net->ipv4.route_hdr->ctl_table_arg;
3009         unregister_net_sysctl_table(net->ipv4.route_hdr);
3010         BUG_ON(tbl == ipv4_route_flush_table);
3011         kfree(tbl);
3012 }
3013
3014 static __net_initdata struct pernet_operations sysctl_route_ops = {
3015         .init = sysctl_route_net_init,
3016         .exit = sysctl_route_net_exit,
3017 };
3018 #endif
3019
3020 static __net_init int rt_genid_init(struct net *net)
3021 {
3022         atomic_set(&net->ipv4.rt_genid, 0);
3023         atomic_set(&net->fnhe_genid, 0);
3024         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3025         return 0;
3026 }
3027
3028 static __net_initdata struct pernet_operations rt_genid_ops = {
3029         .init = rt_genid_init,
3030 };
3031
3032 static int __net_init ipv4_inetpeer_init(struct net *net)
3033 {
3034         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3035
3036         if (!bp)
3037                 return -ENOMEM;
3038         inet_peer_base_init(bp);
3039         net->ipv4.peers = bp;
3040         return 0;
3041 }
3042
3043 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3044 {
3045         struct inet_peer_base *bp = net->ipv4.peers;
3046
3047         net->ipv4.peers = NULL;
3048         inetpeer_invalidate_tree(bp);
3049         kfree(bp);
3050 }
3051
3052 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3053         .init   =       ipv4_inetpeer_init,
3054         .exit   =       ipv4_inetpeer_exit,
3055 };
3056
3057 #ifdef CONFIG_IP_ROUTE_CLASSID
3058 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3059 #endif /* CONFIG_IP_ROUTE_CLASSID */
3060
3061 int __init ip_rt_init(void)
3062 {
3063         int rc = 0;
3064         int cpu;
3065
3066         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3067         if (!ip_idents)
3068                 panic("IP: failed to allocate ip_idents\n");
3069
3070         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3071
3072         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3073         if (!ip_tstamps)
3074                 panic("IP: failed to allocate ip_tstamps\n");
3075
3076         for_each_possible_cpu(cpu) {
3077                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3078
3079                 INIT_LIST_HEAD(&ul->head);
3080                 spin_lock_init(&ul->lock);
3081         }
3082 #ifdef CONFIG_IP_ROUTE_CLASSID
3083         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3084         if (!ip_rt_acct)
3085                 panic("IP: failed to allocate ip_rt_acct\n");
3086 #endif
3087
3088         ipv4_dst_ops.kmem_cachep =
3089                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3090                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3091
3092         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3093
3094         if (dst_entries_init(&ipv4_dst_ops) < 0)
3095                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3096
3097         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3098                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3099
3100         ipv4_dst_ops.gc_thresh = ~0;
3101         ip_rt_max_size = INT_MAX;
3102
3103         devinet_init();
3104         ip_fib_init();
3105
3106         if (ip_rt_proc_init())
3107                 pr_err("Unable to create route proc files\n");
3108 #ifdef CONFIG_XFRM
3109         xfrm_init();
3110         xfrm4_init();
3111 #endif
3112         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3113                       RTNL_FLAG_DOIT_UNLOCKED);
3114
3115 #ifdef CONFIG_SYSCTL
3116         register_pernet_subsys(&sysctl_route_ops);
3117 #endif
3118         register_pernet_subsys(&rt_genid_ops);
3119         register_pernet_subsys(&ipv4_inetpeer_ops);
3120         return rc;
3121 }
3122
3123 #ifdef CONFIG_SYSCTL
3124 /*
3125  * We really need to sanitize the damn ipv4 init order, then all
3126  * this nonsense will go away.
3127  */
3128 void __init ip_static_sysctl_init(void)
3129 {
3130         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3131 }
3132 #endif