1 /*
2 * IP multicast routing support for mrouted 3.6/3.8
3 *
4 * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 * Linux Consultancy and Custom Driver Development
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Fixes:
13 * Michael Chastain : Incorrect size of copying.
14 * Alan Cox : Added the cache manager code
15 * Alan Cox : Fixed the clone/copy bug and device race.
16 * Mike McLagan : Routing by source
17 * Malcolm Beattie : Buffer handling fixes.
18 * Alexey Kuznetsov : Double buffer free and other fixes.
19 * SVR Anand : Fixed several multicast bugs and problems.
20 * Alexey Kuznetsov : Status, optimisations and more.
21 * Brad Parker : Better behaviour on mrouted upcall
22 * overflow.
23 * Carlos Picoto : PIMv1 Support
24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
 25 *	Relax this requirement to work with older peers.
26 *
27 */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 /* Big lock, protecting vif table, mrt cache and mroute socket state.
71 Note that the changes are semaphored via rtnl_lock.
72 */
73
74 static DEFINE_RWLOCK(mrt_lock);
75
76 /*
77 * Multicast router control variables
78 */
79
80 #define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
81
82 static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
83
84 /* Special spinlock for queue of unresolved entries */
85 static DEFINE_SPINLOCK(mfc_unres_lock);
86
 87 /* We return to Alan's original scheme. The hash table of resolved
 88    entries is changed only in process context and is protected
 89    by the weak reader/writer lock mrt_lock. The queue of unresolved
 90    entries is protected by the strong spinlock mfc_unres_lock.
 91
 92    This way the data path is entirely free of exclusive locks.
 93  */
94
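/* A concrete sketch of how this works out in the code below (derived from the
   call sites, not an authoritative locking document): the forwarding path,
   ipmr_get_route() and the /proc readers take read_lock(&mrt_lock); updates
   driven from the mroute socket take write_lock_bh(&mrt_lock) while already
   holding rtnl_lock(); the unresolved queue and its expiry timer are
   serialized only by mfc_unres_lock.
 */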
95 static struct kmem_cache *mrt_cachep __read_mostly;
96
97 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
98 static int ipmr_cache_report(struct net *net,
99 struct sk_buff *pkt, vifi_t vifi, int assert);
100 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
101
102 static struct timer_list ipmr_expire_timer;
103
104 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
105
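/* Tear down the "dvmrp%d" tunnel that was created for a VIF: close the vif
   device, then ask the tunl0 control device to delete the tunnel through its
   ndo_do_ioctl(SIOCDELTUNNEL) handler, temporarily switching to KERNEL_DS
   because that ioctl expects a user-space pointer.
 */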
106 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
107 {
108 struct net *net = dev_net(dev);
109
110 dev_close(dev);
111
112 dev = __dev_get_by_name(net, "tunl0");
113 if (dev) {
114 const struct net_device_ops *ops = dev->netdev_ops;
115 struct ifreq ifr;
116 struct ip_tunnel_parm p;
117
118 memset(&p, 0, sizeof(p));
119 p.iph.daddr = v->vifc_rmt_addr.s_addr;
120 p.iph.saddr = v->vifc_lcl_addr.s_addr;
121 p.iph.version = 4;
122 p.iph.ihl = 5;
123 p.iph.protocol = IPPROTO_IPIP;
124 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
125 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
126
127 if (ops->ndo_do_ioctl) {
128 mm_segment_t oldfs = get_fs();
129
130 set_fs(KERNEL_DS);
131 ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
132 set_fs(oldfs);
133 }
134 }
135 }
136
137 static
138 struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
139 {
140 struct net_device *dev;
141
142 dev = __dev_get_by_name(net, "tunl0");
143
144 if (dev) {
145 const struct net_device_ops *ops = dev->netdev_ops;
146 int err;
147 struct ifreq ifr;
148 struct ip_tunnel_parm p;
149 struct in_device *in_dev;
150
151 memset(&p, 0, sizeof(p));
152 p.iph.daddr = v->vifc_rmt_addr.s_addr;
153 p.iph.saddr = v->vifc_lcl_addr.s_addr;
154 p.iph.version = 4;
155 p.iph.ihl = 5;
156 p.iph.protocol = IPPROTO_IPIP;
157 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
158 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
159
160 if (ops->ndo_do_ioctl) {
161 mm_segment_t oldfs = get_fs();
162
163 set_fs(KERNEL_DS);
164 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
165 set_fs(oldfs);
166 } else
167 err = -EOPNOTSUPP;
168
169 dev = NULL;
170
171 if (err == 0 &&
172 (dev = __dev_get_by_name(net, p.name)) != NULL) {
173 dev->flags |= IFF_MULTICAST;
174
175 in_dev = __in_dev_get_rtnl(dev);
176 if (in_dev == NULL)
177 goto failure;
178
179 ipv4_devconf_setall(in_dev);
180 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
181
182 if (dev_open(dev))
183 goto failure;
184 dev_hold(dev);
185 }
186 }
187 return dev;
188
189 failure:
190 /* allow the register to be completed before unregistering. */
191 rtnl_unlock();
192 rtnl_lock();
193
194 unregister_netdevice(dev);
195 return NULL;
196 }
197
198 #ifdef CONFIG_IP_PIMSM
199
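/* Transmit handler for the PIM register device ("pimreg"). Packets routed out
   of this device are never sent on the wire: they are only counted in the
   device statistics and handed to the user-space PIM daemon as an
   IGMPMSG_WHOLEPKT upcall, then freed.
 */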
200 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
201 {
202 struct net *net = dev_net(dev);
203
204 read_lock(&mrt_lock);
205 dev->stats.tx_bytes += skb->len;
206 dev->stats.tx_packets++;
207 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num,
208 IGMPMSG_WHOLEPKT);
209 read_unlock(&mrt_lock);
210 kfree_skb(skb);
211 return NETDEV_TX_OK;
212 }
213
214 static const struct net_device_ops reg_vif_netdev_ops = {
215 .ndo_start_xmit = reg_vif_xmit,
216 };
217
218 static void reg_vif_setup(struct net_device *dev)
219 {
220 dev->type = ARPHRD_PIMREG;
221 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
222 dev->flags = IFF_NOARP;
 223 	dev->netdev_ops		= &reg_vif_netdev_ops;
224 dev->destructor = free_netdev;
225 dev->features |= NETIF_F_NETNS_LOCAL;
226 }
227
228 static struct net_device *ipmr_reg_vif(struct net *net)
229 {
230 struct net_device *dev;
231 struct in_device *in_dev;
232
233 dev = alloc_netdev(0, "pimreg", reg_vif_setup);
234
235 if (dev == NULL)
236 return NULL;
237
238 dev_net_set(dev, net);
239
240 if (register_netdevice(dev)) {
241 free_netdev(dev);
242 return NULL;
243 }
244 dev->iflink = 0;
245
246 rcu_read_lock();
247 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
248 rcu_read_unlock();
249 goto failure;
250 }
251
252 ipv4_devconf_setall(in_dev);
253 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
254 rcu_read_unlock();
255
256 if (dev_open(dev))
257 goto failure;
258
259 dev_hold(dev);
260
261 return dev;
262
263 failure:
264 /* allow the register to be completed before unregistering. */
265 rtnl_unlock();
266 rtnl_lock();
267
268 unregister_netdevice(dev);
269 return NULL;
270 }
271 #endif
272
273 /*
274 * Delete a VIF entry
275 * @notify: Set to 1, if the caller is a notifier_call
276 */
277
278 static int vif_delete(struct net *net, int vifi, int notify)
279 {
280 struct vif_device *v;
281 struct net_device *dev;
282 struct in_device *in_dev;
283
284 if (vifi < 0 || vifi >= net->ipv4.maxvif)
285 return -EADDRNOTAVAIL;
286
287 v = &net->ipv4.vif_table[vifi];
288
289 write_lock_bh(&mrt_lock);
290 dev = v->dev;
291 v->dev = NULL;
292
293 if (!dev) {
294 write_unlock_bh(&mrt_lock);
295 return -EADDRNOTAVAIL;
296 }
297
298 #ifdef CONFIG_IP_PIMSM
299 if (vifi == net->ipv4.mroute_reg_vif_num)
300 net->ipv4.mroute_reg_vif_num = -1;
301 #endif
302
303 if (vifi+1 == net->ipv4.maxvif) {
304 int tmp;
305 for (tmp=vifi-1; tmp>=0; tmp--) {
306 if (VIF_EXISTS(net, tmp))
307 break;
308 }
309 net->ipv4.maxvif = tmp+1;
310 }
311
312 write_unlock_bh(&mrt_lock);
313
314 dev_set_allmulti(dev, -1);
315
316 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
317 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
318 ip_rt_multicast_event(in_dev);
319 }
320
321 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
322 unregister_netdevice(dev);
323
324 dev_put(dev);
325 return 0;
326 }
327
328 static inline void ipmr_cache_free(struct mfc_cache *c)
329 {
330 release_net(mfc_net(c));
331 kmem_cache_free(mrt_cachep, c);
332 }
333
334 /* Destroy an unresolved cache entry, killing queued skbs
335 and reporting error to netlink readers.
336 */
337
338 static void ipmr_destroy_unres(struct mfc_cache *c)
339 {
340 struct sk_buff *skb;
341 struct nlmsgerr *e;
342 struct net *net = mfc_net(c);
343
344 atomic_dec(&net->ipv4.cache_resolve_queue_len);
345
346 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
347 if (ip_hdr(skb)->version == 0) {
348 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
349 nlh->nlmsg_type = NLMSG_ERROR;
350 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
351 skb_trim(skb, nlh->nlmsg_len);
352 e = NLMSG_DATA(nlh);
353 e->error = -ETIMEDOUT;
354 memset(&e->msg, 0, sizeof(e->msg));
355
356 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
357 } else
358 kfree_skb(skb);
359 }
360
361 ipmr_cache_free(c);
362 }
363
364
365 /* Single timer process for all the unresolved queue. */
366
367 static void ipmr_expire_process(unsigned long dummy)
368 {
369 unsigned long now;
370 unsigned long expires;
371 struct mfc_cache *c, **cp;
372
373 if (!spin_trylock(&mfc_unres_lock)) {
374 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
375 return;
376 }
377
378 if (mfc_unres_queue == NULL)
379 goto out;
380
381 now = jiffies;
382 expires = 10*HZ;
383 cp = &mfc_unres_queue;
384
385 while ((c=*cp) != NULL) {
386 if (time_after(c->mfc_un.unres.expires, now)) {
387 unsigned long interval = c->mfc_un.unres.expires - now;
388 if (interval < expires)
389 expires = interval;
390 cp = &c->next;
391 continue;
392 }
393
394 *cp = c->next;
395
396 ipmr_destroy_unres(c);
397 }
398
399 if (mfc_unres_queue != NULL)
400 mod_timer(&ipmr_expire_timer, jiffies + expires);
401
402 out:
403 spin_unlock(&mfc_unres_lock);
404 }
405
406 /* Fill oifs list. It is called under write locked mrt_lock. */
407
408 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
409 {
410 int vifi;
411 struct net *net = mfc_net(cache);
412
413 cache->mfc_un.res.minvif = MAXVIFS;
414 cache->mfc_un.res.maxvif = 0;
415 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
416
417 for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
418 if (VIF_EXISTS(net, vifi) &&
419 ttls[vifi] && ttls[vifi] < 255) {
420 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
421 if (cache->mfc_un.res.minvif > vifi)
422 cache->mfc_un.res.minvif = vifi;
423 if (cache->mfc_un.res.maxvif <= vifi)
424 cache->mfc_un.res.maxvif = vifi + 1;
425 }
426 }
427 }
428
429 static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
430 {
431 int vifi = vifc->vifc_vifi;
432 struct vif_device *v = &net->ipv4.vif_table[vifi];
433 struct net_device *dev;
434 struct in_device *in_dev;
435 int err;
436
437 /* Is vif busy ? */
438 if (VIF_EXISTS(net, vifi))
439 return -EADDRINUSE;
440
441 switch (vifc->vifc_flags) {
442 #ifdef CONFIG_IP_PIMSM
443 case VIFF_REGISTER:
444 /*
445 * Special Purpose VIF in PIM
446 * All the packets will be sent to the daemon
447 */
448 if (net->ipv4.mroute_reg_vif_num >= 0)
449 return -EADDRINUSE;
450 dev = ipmr_reg_vif(net);
451 if (!dev)
452 return -ENOBUFS;
453 err = dev_set_allmulti(dev, 1);
454 if (err) {
455 unregister_netdevice(dev);
456 dev_put(dev);
457 return err;
458 }
459 break;
460 #endif
461 case VIFF_TUNNEL:
462 dev = ipmr_new_tunnel(net, vifc);
463 if (!dev)
464 return -ENOBUFS;
465 err = dev_set_allmulti(dev, 1);
466 if (err) {
467 ipmr_del_tunnel(dev, vifc);
468 dev_put(dev);
469 return err;
470 }
471 break;
472 case 0:
473 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
474 if (!dev)
475 return -EADDRNOTAVAIL;
476 err = dev_set_allmulti(dev, 1);
477 if (err) {
478 dev_put(dev);
479 return err;
480 }
481 break;
482 default:
483 return -EINVAL;
484 }
485
486 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
487 dev_put(dev);
488 return -EADDRNOTAVAIL;
489 }
490 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
491 ip_rt_multicast_event(in_dev);
492
493 /*
494 * Fill in the VIF structures
495 */
496 v->rate_limit = vifc->vifc_rate_limit;
497 v->local = vifc->vifc_lcl_addr.s_addr;
498 v->remote = vifc->vifc_rmt_addr.s_addr;
499 v->flags = vifc->vifc_flags;
500 if (!mrtsock)
501 v->flags |= VIFF_STATIC;
502 v->threshold = vifc->vifc_threshold;
503 v->bytes_in = 0;
504 v->bytes_out = 0;
505 v->pkt_in = 0;
506 v->pkt_out = 0;
507 v->link = dev->ifindex;
508 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
509 v->link = dev->iflink;
510
511 /* And finish update writing critical data */
512 write_lock_bh(&mrt_lock);
513 v->dev = dev;
514 #ifdef CONFIG_IP_PIMSM
515 if (v->flags&VIFF_REGISTER)
516 net->ipv4.mroute_reg_vif_num = vifi;
517 #endif
518 if (vifi+1 > net->ipv4.maxvif)
519 net->ipv4.maxvif = vifi+1;
520 write_unlock_bh(&mrt_lock);
521 return 0;
522 }
523
524 static struct mfc_cache *ipmr_cache_find(struct net *net,
525 __be32 origin,
526 __be32 mcastgrp)
527 {
528 int line = MFC_HASH(mcastgrp, origin);
529 struct mfc_cache *c;
530
531 for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
532 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
533 break;
534 }
535 return c;
536 }
537
538 /*
539 * Allocate a multicast cache entry
540 */
541 static struct mfc_cache *ipmr_cache_alloc(struct net *net)
542 {
543 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
544 if (c == NULL)
545 return NULL;
546 c->mfc_un.res.minvif = MAXVIFS;
547 mfc_net_set(c, net);
548 return c;
549 }
550
551 static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
552 {
553 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
554 if (c == NULL)
555 return NULL;
556 skb_queue_head_init(&c->mfc_un.unres.unresolved);
557 c->mfc_un.unres.expires = jiffies + 10*HZ;
558 mfc_net_set(c, net);
559 return c;
560 }
561
562 /*
563 * A cache entry has gone into a resolved state from queued
564 */
565
566 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
567 {
568 struct sk_buff *skb;
569 struct nlmsgerr *e;
570
571 /*
572 * Play the pending entries through our router
573 */
574
575 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
576 if (ip_hdr(skb)->version == 0) {
577 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
578
579 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
580 nlh->nlmsg_len = (skb_tail_pointer(skb) -
581 (u8 *)nlh);
582 } else {
583 nlh->nlmsg_type = NLMSG_ERROR;
584 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
585 skb_trim(skb, nlh->nlmsg_len);
586 e = NLMSG_DATA(nlh);
587 e->error = -EMSGSIZE;
588 memset(&e->msg, 0, sizeof(e->msg));
589 }
590
591 rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
592 } else
593 ip_mr_forward(skb, c, 0);
594 }
595 }
596
597 /*
598 * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
599 * expects the following bizarre scheme.
600 *
601 * Called under mrt_lock.
602 */
603
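/* Sketch of the upcall layout as it is constructed below (this mirrors the
   code, it is not taken from a separate spec): for IGMPMSG_NOCACHE and
   IGMPMSG_WRONGVIF the copied IP header doubles as a struct igmpmsg, with the
   protocol field zeroed, im_msgtype set to the assert type and im_vif set to
   the arrival VIF; for IGMPMSG_WHOLEPKT the full packet is kept and only a
   pseudo IP header carrying IGMPMSG_WHOLEPKT is prepended. Either way the skb
   is queued on the mroute socket for mrouted/pimd to read.
 */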
604 static int ipmr_cache_report(struct net *net,
605 struct sk_buff *pkt, vifi_t vifi, int assert)
606 {
607 struct sk_buff *skb;
608 const int ihl = ip_hdrlen(pkt);
609 struct igmphdr *igmp;
610 struct igmpmsg *msg;
611 int ret;
612
613 #ifdef CONFIG_IP_PIMSM
614 if (assert == IGMPMSG_WHOLEPKT)
615 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
616 else
617 #endif
618 skb = alloc_skb(128, GFP_ATOMIC);
619
620 if (!skb)
621 return -ENOBUFS;
622
623 #ifdef CONFIG_IP_PIMSM
624 if (assert == IGMPMSG_WHOLEPKT) {
625 /* Ugly, but we have no choice with this interface.
626 Duplicate old header, fix ihl, length etc.
627 And all this only to mangle msg->im_msgtype and
628 to set msg->im_mbz to "mbz" :-)
629 */
630 skb_push(skb, sizeof(struct iphdr));
631 skb_reset_network_header(skb);
632 skb_reset_transport_header(skb);
633 msg = (struct igmpmsg *)skb_network_header(skb);
634 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
635 msg->im_msgtype = IGMPMSG_WHOLEPKT;
636 msg->im_mbz = 0;
637 msg->im_vif = net->ipv4.mroute_reg_vif_num;
638 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
639 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
640 sizeof(struct iphdr));
641 } else
642 #endif
643 {
644
645 /*
646 * Copy the IP header
647 */
648
649 skb->network_header = skb->tail;
650 skb_put(skb, ihl);
651 skb_copy_to_linear_data(skb, pkt->data, ihl);
652 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
653 msg = (struct igmpmsg *)skb_network_header(skb);
654 msg->im_vif = vifi;
655 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
656
657 /*
658 * Add our header
659 */
660
661 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
662 igmp->type =
663 msg->im_msgtype = assert;
664 igmp->code = 0;
665 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
666 skb->transport_header = skb->network_header;
667 }
668
669 if (net->ipv4.mroute_sk == NULL) {
670 kfree_skb(skb);
671 return -EINVAL;
672 }
673
674 /*
675 * Deliver to mrouted
676 */
677 ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
678 if (ret < 0) {
679 if (net_ratelimit())
680 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
681 kfree_skb(skb);
682 }
683
684 return ret;
685 }
686
 687 /*
 688  *	Queue a packet awaiting route resolution; the unresolved cache
 689  *	entry is looked up or created under mfc_unres_lock.
 690  */
690
691 static int
692 ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
693 {
694 int err;
695 struct mfc_cache *c;
696 const struct iphdr *iph = ip_hdr(skb);
697
698 spin_lock_bh(&mfc_unres_lock);
699 for (c=mfc_unres_queue; c; c=c->next) {
700 if (net_eq(mfc_net(c), net) &&
701 c->mfc_mcastgrp == iph->daddr &&
702 c->mfc_origin == iph->saddr)
703 break;
704 }
705
706 if (c == NULL) {
707 /*
708 * Create a new entry if allowable
709 */
710
711 if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
712 (c = ipmr_cache_alloc_unres(net)) == NULL) {
713 spin_unlock_bh(&mfc_unres_lock);
714
715 kfree_skb(skb);
716 return -ENOBUFS;
717 }
718
719 /*
720 * Fill in the new cache entry
721 */
722 c->mfc_parent = -1;
723 c->mfc_origin = iph->saddr;
724 c->mfc_mcastgrp = iph->daddr;
725
726 /*
727 * Reflect first query at mrouted.
728 */
729 err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
730 if (err < 0) {
731 /* If the report failed throw the cache entry
732 out - Brad Parker
733 */
734 spin_unlock_bh(&mfc_unres_lock);
735
736 ipmr_cache_free(c);
737 kfree_skb(skb);
738 return err;
739 }
740
741 atomic_inc(&net->ipv4.cache_resolve_queue_len);
742 c->next = mfc_unres_queue;
743 mfc_unres_queue = c;
744
745 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
746 }
747
748 /*
749 * See if we can append the packet
750 */
751 if (c->mfc_un.unres.unresolved.qlen>3) {
752 kfree_skb(skb);
753 err = -ENOBUFS;
754 } else {
755 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
756 err = 0;
757 }
758
759 spin_unlock_bh(&mfc_unres_lock);
760 return err;
761 }
762
763 /*
764 * MFC cache manipulation by user space mroute daemon
765 */
766
767 static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
768 {
769 int line;
770 struct mfc_cache *c, **cp;
771
772 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
773
774 for (cp = &net->ipv4.mfc_cache_array[line];
775 (c = *cp) != NULL; cp = &c->next) {
776 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
777 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
778 write_lock_bh(&mrt_lock);
779 *cp = c->next;
780 write_unlock_bh(&mrt_lock);
781
782 ipmr_cache_free(c);
783 return 0;
784 }
785 }
786 return -ENOENT;
787 }
788
789 static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
790 {
791 int line;
792 struct mfc_cache *uc, *c, **cp;
793
794 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
795
796 for (cp = &net->ipv4.mfc_cache_array[line];
797 (c = *cp) != NULL; cp = &c->next) {
798 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
799 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
800 break;
801 }
802
803 if (c != NULL) {
804 write_lock_bh(&mrt_lock);
805 c->mfc_parent = mfc->mfcc_parent;
806 ipmr_update_thresholds(c, mfc->mfcc_ttls);
807 if (!mrtsock)
808 c->mfc_flags |= MFC_STATIC;
809 write_unlock_bh(&mrt_lock);
810 return 0;
811 }
812
813 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
814 return -EINVAL;
815
816 c = ipmr_cache_alloc(net);
817 if (c == NULL)
818 return -ENOMEM;
819
820 c->mfc_origin = mfc->mfcc_origin.s_addr;
821 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
822 c->mfc_parent = mfc->mfcc_parent;
823 ipmr_update_thresholds(c, mfc->mfcc_ttls);
824 if (!mrtsock)
825 c->mfc_flags |= MFC_STATIC;
826
827 write_lock_bh(&mrt_lock);
828 c->next = net->ipv4.mfc_cache_array[line];
829 net->ipv4.mfc_cache_array[line] = c;
830 write_unlock_bh(&mrt_lock);
831
832 /*
833 * Check to see if we resolved a queued list. If so we
834 * need to send on the frames and tidy up.
835 */
836 spin_lock_bh(&mfc_unres_lock);
837 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
838 cp = &uc->next) {
839 if (net_eq(mfc_net(uc), net) &&
840 uc->mfc_origin == c->mfc_origin &&
841 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
842 *cp = uc->next;
843 atomic_dec(&net->ipv4.cache_resolve_queue_len);
844 break;
845 }
846 }
847 if (mfc_unres_queue == NULL)
848 del_timer(&ipmr_expire_timer);
849 spin_unlock_bh(&mfc_unres_lock);
850
851 if (uc) {
852 ipmr_cache_resolve(uc, c);
853 ipmr_cache_free(uc);
854 }
855 return 0;
856 }
857
858 /*
859 * Close the multicast socket, and clear the vif tables etc
860 */
861
862 static void mroute_clean_tables(struct net *net)
863 {
864 int i;
865
866 /*
867 * Shut down all active vif entries
868 */
869 for (i = 0; i < net->ipv4.maxvif; i++) {
870 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
871 vif_delete(net, i, 0);
872 }
873
874 /*
875 * Wipe the cache
876 */
877 for (i=0; i<MFC_LINES; i++) {
878 struct mfc_cache *c, **cp;
879
880 cp = &net->ipv4.mfc_cache_array[i];
881 while ((c = *cp) != NULL) {
882 if (c->mfc_flags&MFC_STATIC) {
883 cp = &c->next;
884 continue;
885 }
886 write_lock_bh(&mrt_lock);
887 *cp = c->next;
888 write_unlock_bh(&mrt_lock);
889
890 ipmr_cache_free(c);
891 }
892 }
893
894 if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
895 struct mfc_cache *c, **cp;
896
897 spin_lock_bh(&mfc_unres_lock);
898 cp = &mfc_unres_queue;
899 while ((c = *cp) != NULL) {
900 if (!net_eq(mfc_net(c), net)) {
901 cp = &c->next;
902 continue;
903 }
904 *cp = c->next;
905
906 ipmr_destroy_unres(c);
907 }
908 spin_unlock_bh(&mfc_unres_lock);
909 }
910 }
911
912 static void mrtsock_destruct(struct sock *sk)
913 {
914 struct net *net = sock_net(sk);
915
916 rtnl_lock();
917 if (sk == net->ipv4.mroute_sk) {
918 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
919
920 write_lock_bh(&mrt_lock);
921 net->ipv4.mroute_sk = NULL;
922 write_unlock_bh(&mrt_lock);
923
924 mroute_clean_tables(net);
925 }
926 rtnl_unlock();
927 }
928
929 /*
930 * Socket options and virtual interface manipulation. The whole
931 * virtual interface system is a complete heap, but unfortunately
932 * that's how BSD mrouted happens to think. Maybe one day with a proper
933 * MOSPF/PIM router set up we can clean this up.
934 */
935
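/* Illustrative user-space usage of this interface (a sketch only, not code
   taken from mrouted/pimd):

	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	int one = 1;
	struct vifctl vc = { .vifc_vifi = 0, ... };
	struct mfcctl mc = { ... };

	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));

   MRT_INIT must come first, and only one socket per namespace may own the
   multicast routing state; see the checks below.
 */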
936 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
937 {
938 int ret;
939 struct vifctl vif;
940 struct mfcctl mfc;
941 struct net *net = sock_net(sk);
942
943 if (optname != MRT_INIT) {
944 if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
945 return -EACCES;
946 }
947
948 switch (optname) {
949 case MRT_INIT:
950 if (sk->sk_type != SOCK_RAW ||
951 inet_sk(sk)->num != IPPROTO_IGMP)
952 return -EOPNOTSUPP;
953 if (optlen != sizeof(int))
954 return -ENOPROTOOPT;
955
956 rtnl_lock();
957 if (net->ipv4.mroute_sk) {
958 rtnl_unlock();
959 return -EADDRINUSE;
960 }
961
962 ret = ip_ra_control(sk, 1, mrtsock_destruct);
963 if (ret == 0) {
964 write_lock_bh(&mrt_lock);
965 net->ipv4.mroute_sk = sk;
966 write_unlock_bh(&mrt_lock);
967
968 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
969 }
970 rtnl_unlock();
971 return ret;
972 case MRT_DONE:
973 if (sk != net->ipv4.mroute_sk)
974 return -EACCES;
975 return ip_ra_control(sk, 0, NULL);
976 case MRT_ADD_VIF:
977 case MRT_DEL_VIF:
978 if (optlen != sizeof(vif))
979 return -EINVAL;
980 if (copy_from_user(&vif, optval, sizeof(vif)))
981 return -EFAULT;
982 if (vif.vifc_vifi >= MAXVIFS)
983 return -ENFILE;
984 rtnl_lock();
985 if (optname == MRT_ADD_VIF) {
986 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
987 } else {
988 ret = vif_delete(net, vif.vifc_vifi, 0);
989 }
990 rtnl_unlock();
991 return ret;
992
993 /*
994 * Manipulate the forwarding caches. These live
995 * in a sort of kernel/user symbiosis.
996 */
997 case MRT_ADD_MFC:
998 case MRT_DEL_MFC:
999 if (optlen != sizeof(mfc))
1000 return -EINVAL;
1001 if (copy_from_user(&mfc, optval, sizeof(mfc)))
1002 return -EFAULT;
1003 rtnl_lock();
1004 if (optname == MRT_DEL_MFC)
1005 ret = ipmr_mfc_delete(net, &mfc);
1006 else
1007 ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
1008 rtnl_unlock();
1009 return ret;
1010 /*
1011 * Control PIM assert.
1012 */
1013 case MRT_ASSERT:
1014 {
1015 int v;
1016 if (get_user(v,(int __user *)optval))
1017 return -EFAULT;
1018 net->ipv4.mroute_do_assert = (v) ? 1 : 0;
1019 return 0;
1020 }
1021 #ifdef CONFIG_IP_PIMSM
1022 case MRT_PIM:
1023 {
1024 int v;
1025
1026 if (get_user(v,(int __user *)optval))
1027 return -EFAULT;
1028 v = (v) ? 1 : 0;
1029
1030 rtnl_lock();
1031 ret = 0;
1032 if (v != net->ipv4.mroute_do_pim) {
1033 net->ipv4.mroute_do_pim = v;
1034 net->ipv4.mroute_do_assert = v;
1035 }
1036 rtnl_unlock();
1037 return ret;
1038 }
1039 #endif
1040 /*
1041 * Spurious command, or MRT_VERSION which you cannot
1042 * set.
1043 */
1044 default:
1045 return -ENOPROTOOPT;
1046 }
1047 }
1048
1049 /*
1050 * Getsock opt support for the multicast routing system.
1051 */
1052
1053 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1054 {
1055 int olr;
1056 int val;
1057 struct net *net = sock_net(sk);
1058
1059 if (optname != MRT_VERSION &&
1060 #ifdef CONFIG_IP_PIMSM
1061 optname!=MRT_PIM &&
1062 #endif
1063 optname!=MRT_ASSERT)
1064 return -ENOPROTOOPT;
1065
1066 if (get_user(olr, optlen))
1067 return -EFAULT;
1068
1069 olr = min_t(unsigned int, olr, sizeof(int));
1070 if (olr < 0)
1071 return -EINVAL;
1072
1073 if (put_user(olr, optlen))
1074 return -EFAULT;
1075 if (optname == MRT_VERSION)
1076 val = 0x0305;
1077 #ifdef CONFIG_IP_PIMSM
1078 else if (optname == MRT_PIM)
1079 val = net->ipv4.mroute_do_pim;
1080 #endif
1081 else
1082 val = net->ipv4.mroute_do_assert;
1083 if (copy_to_user(optval, &val, olr))
1084 return -EFAULT;
1085 return 0;
1086 }
1087
1088 /*
1089 * The IP multicast ioctl support routines.
1090 */
1091
1092 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1093 {
1094 struct sioc_sg_req sr;
1095 struct sioc_vif_req vr;
1096 struct vif_device *vif;
1097 struct mfc_cache *c;
1098 struct net *net = sock_net(sk);
1099
1100 switch (cmd) {
1101 case SIOCGETVIFCNT:
1102 if (copy_from_user(&vr, arg, sizeof(vr)))
1103 return -EFAULT;
1104 if (vr.vifi >= net->ipv4.maxvif)
1105 return -EINVAL;
1106 read_lock(&mrt_lock);
1107 vif = &net->ipv4.vif_table[vr.vifi];
1108 if (VIF_EXISTS(net, vr.vifi)) {
1109 vr.icount = vif->pkt_in;
1110 vr.ocount = vif->pkt_out;
1111 vr.ibytes = vif->bytes_in;
1112 vr.obytes = vif->bytes_out;
1113 read_unlock(&mrt_lock);
1114
1115 if (copy_to_user(arg, &vr, sizeof(vr)))
1116 return -EFAULT;
1117 return 0;
1118 }
1119 read_unlock(&mrt_lock);
1120 return -EADDRNOTAVAIL;
1121 case SIOCGETSGCNT:
1122 if (copy_from_user(&sr, arg, sizeof(sr)))
1123 return -EFAULT;
1124
1125 read_lock(&mrt_lock);
1126 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr);
1127 if (c) {
1128 sr.pktcnt = c->mfc_un.res.pkt;
1129 sr.bytecnt = c->mfc_un.res.bytes;
1130 sr.wrong_if = c->mfc_un.res.wrong_if;
1131 read_unlock(&mrt_lock);
1132
1133 if (copy_to_user(arg, &sr, sizeof(sr)))
1134 return -EFAULT;
1135 return 0;
1136 }
1137 read_unlock(&mrt_lock);
1138 return -EADDRNOTAVAIL;
1139 default:
1140 return -ENOIOCTLCMD;
1141 }
1142 }
1143
1144
1145 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1146 {
1147 struct net_device *dev = ptr;
1148 struct net *net = dev_net(dev);
1149 struct vif_device *v;
1150 int ct;
1151
1152 if (!net_eq(dev_net(dev), net))
1153 return NOTIFY_DONE;
1154
1155 if (event != NETDEV_UNREGISTER)
1156 return NOTIFY_DONE;
1157 v = &net->ipv4.vif_table[0];
1158 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
1159 if (v->dev == dev)
1160 vif_delete(net, ct, 1);
1161 }
1162 return NOTIFY_DONE;
1163 }
1164
1165
1166 static struct notifier_block ip_mr_notifier = {
1167 .notifier_call = ipmr_device_event,
1168 };
1169
1170 /*
1171 * Encapsulate a packet by attaching a valid IPIP header to it.
1172 * This avoids tunnel drivers and other mess and gives us the speed so
1173 * important for multicast video.
1174 */
1175
1176 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1177 {
1178 struct iphdr *iph;
1179 struct iphdr *old_iph = ip_hdr(skb);
1180
1181 skb_push(skb, sizeof(struct iphdr));
1182 skb->transport_header = skb->network_header;
1183 skb_reset_network_header(skb);
1184 iph = ip_hdr(skb);
1185
1186 iph->version = 4;
1187 iph->tos = old_iph->tos;
1188 iph->ttl = old_iph->ttl;
1189 iph->frag_off = 0;
1190 iph->daddr = daddr;
1191 iph->saddr = saddr;
1192 iph->protocol = IPPROTO_IPIP;
1193 iph->ihl = 5;
1194 iph->tot_len = htons(skb->len);
1195 ip_select_ident(iph, skb_dst(skb), NULL);
1196 ip_send_check(iph);
1197
1198 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1199 nf_reset(skb);
1200 }
1201
1202 static inline int ipmr_forward_finish(struct sk_buff *skb)
1203 {
1204 struct ip_options * opt = &(IPCB(skb)->opt);
1205
1206 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1207
1208 if (unlikely(opt->optlen))
1209 ip_forward_options(skb);
1210
1211 return dst_output(skb);
1212 }
1213
1214 /*
1215 * Processing handlers for ipmr_forward
1216 */
1217
1218 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1219 {
1220 struct net *net = mfc_net(c);
1221 const struct iphdr *iph = ip_hdr(skb);
1222 struct vif_device *vif = &net->ipv4.vif_table[vifi];
1223 struct net_device *dev;
1224 struct rtable *rt;
1225 int encap = 0;
1226
1227 if (vif->dev == NULL)
1228 goto out_free;
1229
1230 #ifdef CONFIG_IP_PIMSM
1231 if (vif->flags & VIFF_REGISTER) {
1232 vif->pkt_out++;
1233 vif->bytes_out += skb->len;
1234 vif->dev->stats.tx_bytes += skb->len;
1235 vif->dev->stats.tx_packets++;
1236 ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
1237 goto out_free;
1238 }
1239 #endif
1240
1241 if (vif->flags&VIFF_TUNNEL) {
1242 struct flowi fl = { .oif = vif->link,
1243 .nl_u = { .ip4_u =
1244 { .daddr = vif->remote,
1245 .saddr = vif->local,
1246 .tos = RT_TOS(iph->tos) } },
1247 .proto = IPPROTO_IPIP };
1248 if (ip_route_output_key(net, &rt, &fl))
1249 goto out_free;
1250 encap = sizeof(struct iphdr);
1251 } else {
1252 struct flowi fl = { .oif = vif->link,
1253 .nl_u = { .ip4_u =
1254 { .daddr = iph->daddr,
1255 .tos = RT_TOS(iph->tos) } },
1256 .proto = IPPROTO_IPIP };
1257 if (ip_route_output_key(net, &rt, &fl))
1258 goto out_free;
1259 }
1260
1261 dev = rt->u.dst.dev;
1262
1263 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1264 		/* Do not fragment multicasts. Alas, IPv4 does not
1265 		   allow us to send an ICMP error here, so oversized
1266 		   packets silently disappear into a black hole.
1267 		 */
1268
1269 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1270 ip_rt_put(rt);
1271 goto out_free;
1272 }
1273
1274 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1275
1276 if (skb_cow(skb, encap)) {
1277 ip_rt_put(rt);
1278 goto out_free;
1279 }
1280
1281 vif->pkt_out++;
1282 vif->bytes_out += skb->len;
1283
1284 skb_dst_drop(skb);
1285 skb_dst_set(skb, &rt->u.dst);
1286 ip_decrease_ttl(ip_hdr(skb));
1287
1288 /* FIXME: forward and output firewalls used to be called here.
1289 * What do we do with netfilter? -- RR */
1290 if (vif->flags & VIFF_TUNNEL) {
1291 ip_encap(skb, vif->local, vif->remote);
1292 /* FIXME: extra output firewall step used to be here. --RR */
1293 vif->dev->stats.tx_packets++;
1294 vif->dev->stats.tx_bytes += skb->len;
1295 }
1296
1297 IPCB(skb)->flags |= IPSKB_FORWARDED;
1298
1299 	/*
1300 	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1301 	 * not only before forwarding, but also after forwarding them on all
1302 	 * output interfaces. Clearly, if the mrouter also runs a multicast
1303 	 * application, that application should receive the packets regardless
1304 	 * of which interface it joined on.
1305 	 * If we did not do this, the application would have to join on all
1306 	 * interfaces. On the other hand, a multihomed host (or router, but
1307 	 * not an mrouter) cannot join on more than one interface, since that
1308 	 * would result in receiving duplicate packets.
1309 	 */
1310 NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1311 ipmr_forward_finish);
1312 return;
1313
1314 out_free:
1315 kfree_skb(skb);
1316 return;
1317 }
1318
1319 static int ipmr_find_vif(struct net_device *dev)
1320 {
1321 struct net *net = dev_net(dev);
1322 int ct;
1323 for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
1324 if (net->ipv4.vif_table[ct].dev == dev)
1325 break;
1326 }
1327 return ct;
1328 }
1329
1330 /* "local" means that we should preserve one skb (for local delivery) */
1331
1332 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1333 {
1334 int psend = -1;
1335 int vif, ct;
1336 struct net *net = mfc_net(cache);
1337
1338 vif = cache->mfc_parent;
1339 cache->mfc_un.res.pkt++;
1340 cache->mfc_un.res.bytes += skb->len;
1341
1342 /*
1343 * Wrong interface: drop packet and (maybe) send PIM assert.
1344 */
1345 if (net->ipv4.vif_table[vif].dev != skb->dev) {
1346 int true_vifi;
1347
1348 if (skb_rtable(skb)->fl.iif == 0) {
1349 			/* It is our own packet, looped back.
1350 			   Very complicated situation...
1351
1352 			   The best workaround, until the routing daemons are
1353 			   fixed, is not to redistribute a packet if it was
1354 			   sent through the wrong interface. It means that
1355 			   multicast applications WILL NOT work for
1356 			   (S,G) entries whose default multicast route points
1357 			   to the wrong oif. In any case, it is not a good
1358 			   idea to run multicast applications on a router.
1359 			 */
1360 goto dont_forward;
1361 }
1362
1363 cache->mfc_un.res.wrong_if++;
1364 true_vifi = ipmr_find_vif(skb->dev);
1365
1366 if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
1367 		    /* PIM-SM uses asserts when switching from RPT to SPT,
1368 		       so we cannot check that the packet arrived on an oif.
1369 		       That is bad, but otherwise we would need to move a
1370 		       pretty large chunk of pimd into the kernel. Ough... --ANK
1371 		     */
1372 (net->ipv4.mroute_do_pim ||
1373 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1374 time_after(jiffies,
1375 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1376 cache->mfc_un.res.last_assert = jiffies;
1377 ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
1378 }
1379 goto dont_forward;
1380 }
1381
1382 net->ipv4.vif_table[vif].pkt_in++;
1383 net->ipv4.vif_table[vif].bytes_in += skb->len;
1384
1385 /*
1386 * Forward the frame
1387 */
1388 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1389 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1390 if (psend != -1) {
1391 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1392 if (skb2)
1393 ipmr_queue_xmit(skb2, cache, psend);
1394 }
1395 psend = ct;
1396 }
1397 }
1398 if (psend != -1) {
1399 if (local) {
1400 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1401 if (skb2)
1402 ipmr_queue_xmit(skb2, cache, psend);
1403 } else {
1404 ipmr_queue_xmit(skb, cache, psend);
1405 return 0;
1406 }
1407 }
1408
1409 dont_forward:
1410 if (!local)
1411 kfree_skb(skb);
1412 return 0;
1413 }
1414
1415
1416 /*
1417 * Multicast packets for forwarding arrive here
1418 */
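/* Entry point for multicast packets handed to us by the routing code (it is
   installed as the dst input handler for multicast routes): look up the (S,G)
   cache entry, queue the packet for mrouted if it is still unresolved,
   otherwise forward it, preserving local delivery when RTCF_LOCAL is set.
 */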
1419
1420 int ip_mr_input(struct sk_buff *skb)
1421 {
1422 struct mfc_cache *cache;
1423 struct net *net = dev_net(skb->dev);
1424 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1425
1426 	/* A packet looped back after forwarding should not be forwarded
1427 	   a second time, but it can still be delivered locally.
1428 	 */
1429 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1430 goto dont_forward;
1431
1432 if (!local) {
1433 if (IPCB(skb)->opt.router_alert) {
1434 if (ip_call_ra_chain(skb))
1435 return 0;
1436 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1437 			/* IGMPv1 (and broken IGMPv2 implementations such as
1438 			   Cisco IOS <= 11.2(8)) do not put the router alert
1439 			   option in IGMP packets destined to routable
1440 			   groups. This is very bad, because it means
1441 			   that we can forward NO IGMP messages.
1442 			 */
1443 read_lock(&mrt_lock);
1444 if (net->ipv4.mroute_sk) {
1445 nf_reset(skb);
1446 raw_rcv(net->ipv4.mroute_sk, skb);
1447 read_unlock(&mrt_lock);
1448 return 0;
1449 }
1450 read_unlock(&mrt_lock);
1451 }
1452 }
1453
1454 read_lock(&mrt_lock);
1455 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1456
1457 /*
1458 * No usable cache entry
1459 */
1460 if (cache == NULL) {
1461 int vif;
1462
1463 if (local) {
1464 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1465 ip_local_deliver(skb);
1466 if (skb2 == NULL) {
1467 read_unlock(&mrt_lock);
1468 return -ENOBUFS;
1469 }
1470 skb = skb2;
1471 }
1472
1473 vif = ipmr_find_vif(skb->dev);
1474 if (vif >= 0) {
1475 int err = ipmr_cache_unresolved(net, vif, skb);
1476 read_unlock(&mrt_lock);
1477
1478 return err;
1479 }
1480 read_unlock(&mrt_lock);
1481 kfree_skb(skb);
1482 return -ENODEV;
1483 }
1484
1485 ip_mr_forward(skb, cache, local);
1486
1487 read_unlock(&mrt_lock);
1488
1489 if (local)
1490 return ip_local_deliver(skb);
1491
1492 return 0;
1493
1494 dont_forward:
1495 if (local)
1496 return ip_local_deliver(skb);
1497 kfree_skb(skb);
1498 return 0;
1499 }
1500
1501 #ifdef CONFIG_IP_PIMSM
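/* Common PIM REGISTER handling for PIMv1 and PIMv2: validate the encapsulated
   IP packet, then strip the outer headers and re-inject it on the "pimreg"
   register device so that it is forwarded like any other multicast packet.
   Returns 0 if the skb was consumed, non-zero if the caller should drop it.
 */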
1502 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1503 {
1504 struct net_device *reg_dev = NULL;
1505 struct iphdr *encap;
1506 struct net *net = dev_net(skb->dev);
1507
1508 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1509 /*
1510 Check that:
1511 	   a. packet is really destined to a multicast group
1512 b. packet is not a NULL-REGISTER
1513 c. packet is not truncated
1514 */
1515 if (!ipv4_is_multicast(encap->daddr) ||
1516 encap->tot_len == 0 ||
1517 ntohs(encap->tot_len) + pimlen > skb->len)
1518 return 1;
1519
1520 read_lock(&mrt_lock);
1521 if (net->ipv4.mroute_reg_vif_num >= 0)
1522 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev;
1523 if (reg_dev)
1524 dev_hold(reg_dev);
1525 read_unlock(&mrt_lock);
1526
1527 if (reg_dev == NULL)
1528 return 1;
1529
1530 skb->mac_header = skb->network_header;
1531 skb_pull(skb, (u8*)encap - skb->data);
1532 skb_reset_network_header(skb);
1533 skb->dev = reg_dev;
1534 skb->protocol = htons(ETH_P_IP);
1535 skb->ip_summed = 0;
1536 skb->pkt_type = PACKET_HOST;
1537 skb_dst_drop(skb);
1538 reg_dev->stats.rx_bytes += skb->len;
1539 reg_dev->stats.rx_packets++;
1540 nf_reset(skb);
1541 netif_rx(skb);
1542 dev_put(reg_dev);
1543
1544 return 0;
1545 }
1546 #endif
1547
1548 #ifdef CONFIG_IP_PIMSM_V1
1549 /*
1550 * Handle IGMP messages of PIMv1
1551 */
1552
1553 int pim_rcv_v1(struct sk_buff * skb)
1554 {
1555 struct igmphdr *pim;
1556 struct net *net = dev_net(skb->dev);
1557
1558 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1559 goto drop;
1560
1561 pim = igmp_hdr(skb);
1562
1563 if (!net->ipv4.mroute_do_pim ||
1564 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1565 goto drop;
1566
1567 if (__pim_rcv(skb, sizeof(*pim))) {
1568 drop:
1569 kfree_skb(skb);
1570 }
1571 return 0;
1572 }
1573 #endif
1574
1575 #ifdef CONFIG_IP_PIMSM_V2
1576 static int pim_rcv(struct sk_buff * skb)
1577 {
1578 struct pimreghdr *pim;
1579
1580 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1581 goto drop;
1582
1583 pim = (struct pimreghdr *)skb_transport_header(skb);
1584 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1585 (pim->flags&PIM_NULL_REGISTER) ||
1586 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1587 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1588 goto drop;
1589
1590 if (__pim_rcv(skb, sizeof(*pim))) {
1591 drop:
1592 kfree_skb(skb);
1593 }
1594 return 0;
1595 }
1596 #endif
1597
1598 static int
1599 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1600 {
1601 int ct;
1602 struct rtnexthop *nhp;
1603 struct net *net = mfc_net(c);
1604 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1605 u8 *b = skb_tail_pointer(skb);
1606 struct rtattr *mp_head;
1607
1608 if (dev)
1609 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1610
1611 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1612
1613 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1614 if (c->mfc_un.res.ttls[ct] < 255) {
1615 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1616 goto rtattr_failure;
1617 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1618 nhp->rtnh_flags = 0;
1619 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1620 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex;
1621 nhp->rtnh_len = sizeof(*nhp);
1622 }
1623 }
1624 mp_head->rta_type = RTA_MULTIPATH;
1625 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1626 rtm->rtm_type = RTN_MULTICAST;
1627 return 1;
1628
1629 rtattr_failure:
1630 nlmsg_trim(skb, b);
1631 return -EMSGSIZE;
1632 }
1633
1634 int ipmr_get_route(struct net *net,
1635 struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1636 {
1637 int err;
1638 struct mfc_cache *cache;
1639 struct rtable *rt = skb_rtable(skb);
1640
1641 read_lock(&mrt_lock);
1642 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
1643
1644 if (cache == NULL) {
1645 struct sk_buff *skb2;
1646 struct iphdr *iph;
1647 struct net_device *dev;
1648 int vif;
1649
1650 if (nowait) {
1651 read_unlock(&mrt_lock);
1652 return -EAGAIN;
1653 }
1654
1655 dev = skb->dev;
1656 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1657 read_unlock(&mrt_lock);
1658 return -ENODEV;
1659 }
1660 skb2 = skb_clone(skb, GFP_ATOMIC);
1661 if (!skb2) {
1662 read_unlock(&mrt_lock);
1663 return -ENOMEM;
1664 }
1665
1666 skb_push(skb2, sizeof(struct iphdr));
1667 skb_reset_network_header(skb2);
1668 iph = ip_hdr(skb2);
1669 iph->ihl = sizeof(struct iphdr) >> 2;
1670 iph->saddr = rt->rt_src;
1671 iph->daddr = rt->rt_dst;
1672 iph->version = 0;
1673 err = ipmr_cache_unresolved(net, vif, skb2);
1674 read_unlock(&mrt_lock);
1675 return err;
1676 }
1677
1678 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1679 cache->mfc_flags |= MFC_NOTIFY;
1680 err = ipmr_fill_mroute(skb, cache, rtm);
1681 read_unlock(&mrt_lock);
1682 return err;
1683 }
1684
1685 #ifdef CONFIG_PROC_FS
1686 /*
1687  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1688 */
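/* Example of the table exposed through /proc/net/ip_mr_vif (the values below
   are illustrative only; the exact columns come from ipmr_vif_seq_show()):

	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
	 0 eth0             1500      10     3000      20 00000 0100000A 00000000
 */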
1689 struct ipmr_vif_iter {
1690 struct seq_net_private p;
1691 int ct;
1692 };
1693
1694 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1695 struct ipmr_vif_iter *iter,
1696 loff_t pos)
1697 {
1698 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) {
1699 if (!VIF_EXISTS(net, iter->ct))
1700 continue;
1701 if (pos-- == 0)
1702 return &net->ipv4.vif_table[iter->ct];
1703 }
1704 return NULL;
1705 }
1706
1707 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1708 __acquires(mrt_lock)
1709 {
1710 struct net *net = seq_file_net(seq);
1711
1712 read_lock(&mrt_lock);
1713 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
1714 : SEQ_START_TOKEN;
1715 }
1716
1717 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1718 {
1719 struct ipmr_vif_iter *iter = seq->private;
1720 struct net *net = seq_file_net(seq);
1721
1722 ++*pos;
1723 if (v == SEQ_START_TOKEN)
1724 return ipmr_vif_seq_idx(net, iter, 0);
1725
1726 while (++iter->ct < net->ipv4.maxvif) {
1727 if (!VIF_EXISTS(net, iter->ct))
1728 continue;
1729 return &net->ipv4.vif_table[iter->ct];
1730 }
1731 return NULL;
1732 }
1733
1734 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1735 __releases(mrt_lock)
1736 {
1737 read_unlock(&mrt_lock);
1738 }
1739
1740 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1741 {
1742 struct net *net = seq_file_net(seq);
1743
1744 if (v == SEQ_START_TOKEN) {
1745 seq_puts(seq,
1746 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
1747 } else {
1748 const struct vif_device *vif = v;
1749 const char *name = vif->dev ? vif->dev->name : "none";
1750
1751 seq_printf(seq,
1752 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1753 vif - net->ipv4.vif_table,
1754 name, vif->bytes_in, vif->pkt_in,
1755 vif->bytes_out, vif->pkt_out,
1756 vif->flags, vif->local, vif->remote);
1757 }
1758 return 0;
1759 }
1760
1761 static const struct seq_operations ipmr_vif_seq_ops = {
1762 .start = ipmr_vif_seq_start,
1763 .next = ipmr_vif_seq_next,
1764 .stop = ipmr_vif_seq_stop,
1765 .show = ipmr_vif_seq_show,
1766 };
1767
1768 static int ipmr_vif_open(struct inode *inode, struct file *file)
1769 {
1770 return seq_open_net(inode, file, &ipmr_vif_seq_ops,
1771 sizeof(struct ipmr_vif_iter));
1772 }
1773
1774 static const struct file_operations ipmr_vif_fops = {
1775 .owner = THIS_MODULE,
1776 .open = ipmr_vif_open,
1777 .read = seq_read,
1778 .llseek = seq_lseek,
1779 .release = seq_release_net,
1780 };
1781
1782 struct ipmr_mfc_iter {
1783 struct seq_net_private p;
1784 struct mfc_cache **cache;
1785 int ct;
1786 };
1787
1788
1789 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
1790 struct ipmr_mfc_iter *it, loff_t pos)
1791 {
1792 struct mfc_cache *mfc;
1793
1794 it->cache = net->ipv4.mfc_cache_array;
1795 read_lock(&mrt_lock);
1796 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1797 for (mfc = net->ipv4.mfc_cache_array[it->ct];
1798 mfc; mfc = mfc->next)
1799 if (pos-- == 0)
1800 return mfc;
1801 read_unlock(&mrt_lock);
1802
1803 it->cache = &mfc_unres_queue;
1804 spin_lock_bh(&mfc_unres_lock);
1805 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1806 if (net_eq(mfc_net(mfc), net) &&
1807 pos-- == 0)
1808 return mfc;
1809 spin_unlock_bh(&mfc_unres_lock);
1810
1811 it->cache = NULL;
1812 return NULL;
1813 }
1814
1815
1816 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1817 {
1818 struct ipmr_mfc_iter *it = seq->private;
1819 struct net *net = seq_file_net(seq);
1820
1821 it->cache = NULL;
1822 it->ct = 0;
1823 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
1824 : SEQ_START_TOKEN;
1825 }
1826
1827 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1828 {
1829 struct mfc_cache *mfc = v;
1830 struct ipmr_mfc_iter *it = seq->private;
1831 struct net *net = seq_file_net(seq);
1832
1833 ++*pos;
1834
1835 if (v == SEQ_START_TOKEN)
1836 return ipmr_mfc_seq_idx(net, seq->private, 0);
1837
1838 if (mfc->next)
1839 return mfc->next;
1840
1841 if (it->cache == &mfc_unres_queue)
1842 goto end_of_list;
1843
1844 BUG_ON(it->cache != net->ipv4.mfc_cache_array);
1845
1846 while (++it->ct < MFC_LINES) {
1847 mfc = net->ipv4.mfc_cache_array[it->ct];
1848 if (mfc)
1849 return mfc;
1850 }
1851
1852 /* exhausted cache_array, show unresolved */
1853 read_unlock(&mrt_lock);
1854 it->cache = &mfc_unres_queue;
1855 it->ct = 0;
1856
1857 spin_lock_bh(&mfc_unres_lock);
1858 mfc = mfc_unres_queue;
1859 while (mfc && !net_eq(mfc_net(mfc), net))
1860 mfc = mfc->next;
1861 if (mfc)
1862 return mfc;
1863
1864 end_of_list:
1865 spin_unlock_bh(&mfc_unres_lock);
1866 it->cache = NULL;
1867
1868 return NULL;
1869 }
1870
1871 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1872 {
1873 struct ipmr_mfc_iter *it = seq->private;
1874 struct net *net = seq_file_net(seq);
1875
1876 if (it->cache == &mfc_unres_queue)
1877 spin_unlock_bh(&mfc_unres_lock);
1878 else if (it->cache == net->ipv4.mfc_cache_array)
1879 read_unlock(&mrt_lock);
1880 }
1881
1882 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1883 {
1884 int n;
1885 struct net *net = seq_file_net(seq);
1886
1887 if (v == SEQ_START_TOKEN) {
1888 seq_puts(seq,
1889 "Group Origin Iif Pkts Bytes Wrong Oifs\n");
1890 } else {
1891 const struct mfc_cache *mfc = v;
1892 const struct ipmr_mfc_iter *it = seq->private;
1893
1894 seq_printf(seq, "%08lX %08lX %-3hd",
1895 (unsigned long) mfc->mfc_mcastgrp,
1896 (unsigned long) mfc->mfc_origin,
1897 mfc->mfc_parent);
1898
1899 if (it->cache != &mfc_unres_queue) {
1900 seq_printf(seq, " %8lu %8lu %8lu",
1901 mfc->mfc_un.res.pkt,
1902 mfc->mfc_un.res.bytes,
1903 mfc->mfc_un.res.wrong_if);
1904 for (n = mfc->mfc_un.res.minvif;
1905 n < mfc->mfc_un.res.maxvif; n++ ) {
1906 if (VIF_EXISTS(net, n) &&
1907 mfc->mfc_un.res.ttls[n] < 255)
1908 seq_printf(seq,
1909 " %2d:%-3d",
1910 n, mfc->mfc_un.res.ttls[n]);
1911 }
1912 } else {
1913 /* unresolved mfc_caches don't contain
1914 * pkt, bytes and wrong_if values
1915 */
1916 seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1917 }
1918 seq_putc(seq, '\n');
1919 }
1920 return 0;
1921 }
1922
1923 static const struct seq_operations ipmr_mfc_seq_ops = {
1924 .start = ipmr_mfc_seq_start,
1925 .next = ipmr_mfc_seq_next,
1926 .stop = ipmr_mfc_seq_stop,
1927 .show = ipmr_mfc_seq_show,
1928 };
1929
1930 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1931 {
1932 return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
1933 sizeof(struct ipmr_mfc_iter));
1934 }
1935
1936 static const struct file_operations ipmr_mfc_fops = {
1937 .owner = THIS_MODULE,
1938 .open = ipmr_mfc_open,
1939 .read = seq_read,
1940 .llseek = seq_lseek,
1941 .release = seq_release_net,
1942 };
1943 #endif
1944
1945 #ifdef CONFIG_IP_PIMSM_V2
1946 static const struct net_protocol pim_protocol = {
1947 .handler = pim_rcv,
1948 .netns_ok = 1,
1949 };
1950 #endif
1951
1952
1953 /*
1954 * Setup for IP multicast routing
1955 */
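/* Per-namespace setup: allocate the vif table and the MFC hash array, reset
   the register-VIF index and create the /proc entries; ipmr_net_exit() below
   undoes all of this when the namespace goes away.
 */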
1956 static int __net_init ipmr_net_init(struct net *net)
1957 {
1958 int err = 0;
1959
1960 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1961 GFP_KERNEL);
1962 if (!net->ipv4.vif_table) {
1963 err = -ENOMEM;
1964 goto fail;
1965 }
1966
1967 /* Forwarding cache */
1968 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1969 sizeof(struct mfc_cache *),
1970 GFP_KERNEL);
1971 if (!net->ipv4.mfc_cache_array) {
1972 err = -ENOMEM;
1973 goto fail_mfc_cache;
1974 }
1975
1976 #ifdef CONFIG_IP_PIMSM
1977 net->ipv4.mroute_reg_vif_num = -1;
1978 #endif
1979
1980 #ifdef CONFIG_PROC_FS
1981 err = -ENOMEM;
1982 if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
1983 goto proc_vif_fail;
1984 if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1985 goto proc_cache_fail;
1986 #endif
1987 return 0;
1988
1989 #ifdef CONFIG_PROC_FS
1990 proc_cache_fail:
1991 proc_net_remove(net, "ip_mr_vif");
1992 proc_vif_fail:
1993 kfree(net->ipv4.mfc_cache_array);
1994 #endif
1995 fail_mfc_cache:
1996 kfree(net->ipv4.vif_table);
1997 fail:
1998 return err;
1999 }
2000
2001 static void __net_exit ipmr_net_exit(struct net *net)
2002 {
2003 #ifdef CONFIG_PROC_FS
2004 proc_net_remove(net, "ip_mr_cache");
2005 proc_net_remove(net, "ip_mr_vif");
2006 #endif
2007 kfree(net->ipv4.mfc_cache_array);
2008 kfree(net->ipv4.vif_table);
2009 }
2010
2011 static struct pernet_operations ipmr_net_ops = {
2012 .init = ipmr_net_init,
2013 .exit = ipmr_net_exit,
2014 };
2015
2016 int __init ip_mr_init(void)
2017 {
2018 int err;
2019
2020 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2021 sizeof(struct mfc_cache),
2022 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2023 NULL);
2024 if (!mrt_cachep)
2025 return -ENOMEM;
2026
2027 err = register_pernet_subsys(&ipmr_net_ops);
2028 if (err)
2029 goto reg_pernet_fail;
2030
2031 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
2032 err = register_netdevice_notifier(&ip_mr_notifier);
2033 if (err)
2034 goto reg_notif_fail;
2035 #ifdef CONFIG_IP_PIMSM_V2
2036 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2037 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2038 err = -EAGAIN;
2039 goto add_proto_fail;
2040 }
2041 #endif
2042 return 0;
2043
2044 #ifdef CONFIG_IP_PIMSM_V2
2045 add_proto_fail:
2046 unregister_netdevice_notifier(&ip_mr_notifier);
2047 #endif
2048 reg_notif_fail:
2049 del_timer(&ipmr_expire_timer);
2050 unregister_pernet_subsys(&ipmr_net_ops);
2051 reg_pernet_fail:
2052 kmem_cache_destroy(mrt_cachep);
2053 return err;
2054 }