[IPv4]: Use rtnl registration interface
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4
LT
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/init.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/kmod.h>
35#include <linux/list.h>
36#include <linux/bitops.h>
4179477f 37#include <linux/hrtimer.h>
1da177e4 38
dc5fc579 39#include <net/netlink.h>
1da177e4
LT
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42
43#include <asm/processor.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
52/*
53
54 Short review.
55 -------------
56
57 This file consists of two interrelated parts:
58
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
61
62 Generally, queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 device is ready to send something) in order and at times
65 determined by algorithm hidden in it.
66
   qdisc's are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets to "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
71
72 In turn, classes may have child qdiscs (as rule, queues)
73 attached to them etc. etc. etc.
74
75 The goal of the routines in this file is to translate
76 information supplied by user in the form of handles
77 to more intelligible for kernel form, to make some sanity
78 checks and part of work, which is common to all qdiscs
79 and to provide rtnetlink notifications.
80
81 All real intelligent work is done inside qdisc modules.
82
83
84
85 Every discipline has two major routines: enqueue and dequeue.
86
87 ---dequeue
88
89 dequeue usually returns a skb to send. It is allowed to return NULL,
90 but it does not mean that queue is empty, it just means that
91 discipline does not want to send anything this time.
92 Queue is really empty if q->q.qlen == 0.
93 For complicated disciplines with multiple queues q->q is not
94 real packet queue, but however q->q.qlen must be valid.
95
96 ---enqueue
97
98 enqueue returns 0, if packet was enqueued successfully.
99 If packet (this one or another one) was dropped, it returns
100 not zero error code.
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
107
108 Auxiliary routines:
109
110 ---requeue
111
112 requeues once dequeued packet. It is used for non-standard or
113 just buggy devices, which can defer output even if dev->tbusy=0.
114
115 ---reset
116
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
119
120 ---init
121
122 initializes newly created qdisc.
123
124 ---destroy
125
126 destroys resources allocated by init and during lifetime of qdisc.
127
128 ---change
129
130 changes qdisc parameters.
131 */
132
133/* Protects list of registered TC modules. It is pure SMP lock. */
134static DEFINE_RWLOCK(qdisc_mod_lock);
135
136
137/************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
140
141
142/* The list of all installed queueing disciplines. */
143
144static struct Qdisc_ops *qdisc_base;
145
/* Register/unregister queueing discipline */
147
148int register_qdisc(struct Qdisc_ops *qops)
149{
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171}
172
173int unregister_qdisc(struct Qdisc_ops *qops)
174{
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189}
190
191/* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
193 */
194
43effa1e 195static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4
LT
196{
197 struct Qdisc *q;
198
1da177e4 199 list_for_each_entry(q, &dev->qdisc_list, list) {
43effa1e 200 if (q->handle == handle)
1da177e4 201 return q;
1da177e4 202 }
1da177e4
LT
203 return NULL;
204}
205
43effa1e
PM
206struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
207{
208 struct Qdisc *q;
209
210 read_lock(&qdisc_tree_lock);
211 q = __qdisc_lookup(dev, handle);
212 read_unlock(&qdisc_tree_lock);
213 return q;
214}
215
1da177e4
LT
216static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
217{
218 unsigned long cl;
219 struct Qdisc *leaf;
220 struct Qdisc_class_ops *cops = p->ops->cl_ops;
221
222 if (cops == NULL)
223 return NULL;
224 cl = cops->get(p, classid);
225
226 if (cl == 0)
227 return NULL;
228 leaf = cops->leaf(p, cl);
229 cops->put(p, cl);
230 return leaf;
231}
232
233/* Find queueing discipline by name */
234
235static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
236{
237 struct Qdisc_ops *q = NULL;
238
239 if (kind) {
240 read_lock(&qdisc_mod_lock);
241 for (q = qdisc_base; q; q = q->next) {
242 if (rtattr_strcmp(kind, q->id) == 0) {
243 if (!try_module_get(q->owner))
244 q = NULL;
245 break;
246 }
247 }
248 read_unlock(&qdisc_mod_lock);
249 }
250 return q;
251}
252
253static struct qdisc_rate_table *qdisc_rtab_list;
254
255struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
256{
257 struct qdisc_rate_table *rtab;
258
259 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
260 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
261 rtab->refcnt++;
262 return rtab;
263 }
264 }
265
266 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
267 return NULL;
268
269 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
270 if (rtab) {
271 rtab->rate = *r;
272 rtab->refcnt = 1;
273 memcpy(rtab->data, RTA_DATA(tab), 1024);
274 rtab->next = qdisc_rtab_list;
275 qdisc_rtab_list = rtab;
276 }
277 return rtab;
278}
279
280void qdisc_put_rtab(struct qdisc_rate_table *tab)
281{
282 struct qdisc_rate_table *rtab, **rtabp;
283
284 if (!tab || --tab->refcnt)
285 return;
286
287 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
288 if (rtab == tab) {
289 *rtabp = rtab->next;
290 kfree(rtab);
291 return;
292 }
293 }
294}
295
4179477f
PM
296static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
297{
298 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
299 timer);
300
301 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
302 netif_schedule(wd->qdisc->dev);
303 return HRTIMER_NORESTART;
304}
305
306void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
307{
308 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
309 wd->timer.function = qdisc_watchdog;
310 wd->qdisc = qdisc;
311}
312EXPORT_SYMBOL(qdisc_watchdog_init);
313
314void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
315{
316 ktime_t time;
317
318 wd->qdisc->flags |= TCQ_F_THROTTLED;
319 time = ktime_set(0, 0);
320 time = ktime_add_ns(time, PSCHED_US2NS(expires));
321 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
322}
323EXPORT_SYMBOL(qdisc_watchdog_schedule);
324
/* Cancel a pending watchdog and unthrottle the qdisc.
 * The timer is cancelled first so the flag cannot be re-cleared by
 * a concurrently firing callback after we return.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4
LT
331
332/* Allocate an unique handle from space managed by kernel */
333
334static u32 qdisc_alloc_handle(struct net_device *dev)
335{
336 int i = 0x10000;
337 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
338
339 do {
340 autohandle += TC_H_MAKE(0x10000U, 0);
341 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
342 autohandle = TC_H_MAKE(0x80000000U, 0);
343 } while (qdisc_lookup(dev, autohandle) && --i > 0);
344
345 return i>0 ? autohandle : 0;
346}
347
/* Attach toplevel qdisc to device dev.
 *
 * Returns the previously attached qdisc; ownership of it passes to
 * the caller. The device is deactivated around the swap so no
 * packets are in flight while pointers change.
 */
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	/* Quiesce traffic before touching the qdisc pointers. */
	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {	/* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		/* dev_activate() below promotes qdisc_sleeping to the
		 * active qdisc; until then the device runs noop. */
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}
392
43effa1e
PM
393void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
394{
395 struct Qdisc_class_ops *cops;
396 unsigned long cl;
397 u32 parentid;
398
399 if (n == 0)
400 return;
401 while ((parentid = sch->parent)) {
402 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
403 cops = sch->ops->cl_ops;
404 if (cops->qlen_notify) {
405 cl = cops->get(sch, parentid);
406 cops->qlen_notify(sch, cl);
407 cops->put(sch, cl);
408 }
409 sch->q.qlen -= n;
410 }
411}
412EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4
LT
413
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		/* Device-level graft. An existing ingress qdisc is passed
		 * back through dev_graft_qdisc(), which keys on
		 * TCQ_F_INGRESS to pick the right attachment point. */
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		/* Graft inside a classful parent: delegate to its class
		 * ops; -EINVAL if the parent is classless or the class
		 * does not exist. */
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
451
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.

   Returns the new qdisc on the device's qdisc_list, or NULL with
   the error code stored in *errp. -EAGAIN means a module was
   loaded and the caller must replay the whole request.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	/* qdisc_alloc() takes a device reference on success. */
	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		/* No handle requested: draw one from the kernel pool. */
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
	/* Error unwinding: undo, in order, the device reference and
	 * raw allocation, the module reference, and report via *errp. */
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
550
551static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
552{
553 if (tca[TCA_OPTIONS-1]) {
554 int err;
555
556 if (sch->ops->change == NULL)
557 return -EINVAL;
558 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
559 if (err)
560 return err;
561 }
562#ifdef CONFIG_NET_ESTIMATOR
563 if (tca[TCA_RATE-1])
564 gen_replace_estimator(&sch->bstats, &sch->rate_est,
565 sch->stats_lock, tca[TCA_RATE-1]);
566#endif
567 return 0;
568}
569
/* Walker state for check_loop(): p is the qdisc about to be
 * grafted, depth bounds the recursion. */
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

/* Return -ELOOP if grafting p somewhere under q would create a
 * cycle in the qdisc hierarchy; 0 otherwise. */
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	/* A classless qdisc has no children, so no loop is possible. */
	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	/* The walker sets w.stop when check_loop_fn reports -ELOOP. */
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		/* Found p below q, or exceeded the 7-level depth cap:
		 * both count as a loop. */
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		/* Recurse one level deeper into the child qdisc. */
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
609
/*
 * Delete/get qdisc. Handles RTM_DELQDISC and RTM_GETQDISC.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		/* Parent given: resolve the target q through it. */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		/* If a handle was also given, it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		/* No parent: look up directly by handle. */
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* Refuse to delete the anonymous default qdisc. */
		if (q->handle == 0)
			return -ENOENT;
		/* Graft NULL in its place; q becomes the detached qdisc. */
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		/* RTM_GETQDISC: just report it back. */
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
670
/*
   Create/change qdisc. Handles RTM_NEWQDISC: depending on the
   netlink flags and whether a qdisc already exists at the target
   position, this either changes it in place or creates a new one
   and grafts it.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		/* Resolve the qdisc currently sitting at the target spot. */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				/* Moving an existing qdisc under p must not
				 * create a cycle in the hierarchy. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * either to change it or to create/graft new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, requestor wanted to say,
				 * that qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft, if
				 * user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		/* -EAGAIN: qdisc_create loaded a module after dropping
		 * RTNL; the whole request must be replayed. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			/* Grafting failed: destroy the qdisc we created
			 * (or drop the extra reference we took above). */
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
811
/* Build one RTM_*QDISC netlink message describing @q into @skb.
 * Returns skb->len on success, -1 if the skb ran out of room (the
 * partial message is trimmed off). NLMSG_NEW/RTA_PUT jump to the
 * failure labels on overflow.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	/* Patch the final message length now that all attrs are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
859
/* Send an rtnetlink notification about a qdisc change: a DELQDISC
 * for @old (if it had a real handle) and/or a NEWQDISC for @new,
 * multicast to RTNLGRP_TC and echoed to the requester if asked.
 */
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* Nothing was filled in (e.g. both NULL): report -EINVAL. */
	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
886
/* RTM_GETQDISC dump callback: walk every device and every qdisc on
 * it, emitting one message per qdisc. The (device index, qdisc
 * index) cursor is kept in cb->args[0..1] so the dump can resume
 * when the skb fills up.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	/* Resume positions from the previous dump chunk. */
	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: restart qdisc cursor */
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				/* skb full: stop here, cursor marks resume point. */
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
927
928
929
930/************************************************
931 * Traffic classes manipulation. *
932 ************************************************/
933
934
935
/* Handle RTM_{NEW,DEL,GET}TCLASS: resolve the owning qdisc from
 * the parent/handle pair, then dispatch to the qdisc's class ops.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* Class not found: only NEWTCLASS with NLM_F_CREATE may
		 * proceed (to create it below). */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change the class via the qdisc's change() hook. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
1047
1048
/* Build one RTM_*TCLASS netlink message describing class @cl of
 * qdisc @q into @skb. Returns skb->len on success, -1 on overflow
 * (partial message trimmed off).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	/* Let the class fill in its own attributes (incl. tcm_handle). */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1088
1089static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1090 struct Qdisc *q, unsigned long cl, int event)
1091{
1092 struct sk_buff *skb;
1093 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1094
1095 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1096 if (!skb)
1097 return -ENOBUFS;
1098
1099 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1100 kfree_skb(skb);
1101 return -EINVAL;
1102 }
1103
ac6d439d 1104 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1105}
1106
/* Walker state carrying the dump skb and netlink callback through
 * the per-qdisc class walk. */
struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

/* Per-class walker callback: emit one RTM_NEWTCLASS message. */
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
1121
/* RTM_GETTCLASS dump callback: walk the classes of every matching
 * qdisc on the requested device. cb->args[0] is the qdisc cursor,
 * cb->args[1..] the per-qdisc class walker state, so the dump can
 * resume when the skb fills up.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	/* Takes a device reference; released via dev_put() below. */
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		/* Skip already-dumped, classless, or non-matching qdiscs. */
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			/* New qdisc: reset the class-walk resume state. */
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1168
/* Main classifier routine: scans classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.

   Returns the classifier verdict (>= 0) or -1 when no filter in
   the chain matched.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;	/* chain head, for reclassify restarts */
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if ( TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				/* Cap reclassification rounds; a filter that
				 * keeps asking for reclassify is broken. */
				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
				goto reclassify;
			} else {
				/* Final verdict: clear the reclassify count. */
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
				return err;
			}
#else

			return err;
#endif
		}

	}
	return -1;
}
1214
1da177e4
LT
1215#ifdef CONFIG_PROC_FS
/* /proc/net/psched: four hex words describing the packet scheduler
 * clock (tick-to-usec conversion parameters and timer resolution),
 * consumed by userspace tc.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));

	return 0;
}
1225
/* seq_file open hook for /proc/net/psched. */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
1230
/* File operations for the read-only /proc/net/psched entry. */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1da177e4
LT
1238#endif
1239
1da177e4
LT
/* Subsystem init: wire the qdisc/class handlers into the PF_UNSPEC
 * rtnetlink dispatch table, register the built-in fifo qdiscs and
 * create /proc/net/psched.
 */
static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Setup rtnetlink links. It is made here to avoid
	   exporting large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	/* pfifo/bfifo are always available as built-in defaults. */
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);
1269
1270EXPORT_SYMBOL(qdisc_get_rtab);
1271EXPORT_SYMBOL(qdisc_put_rtab);
1272EXPORT_SYMBOL(register_qdisc);
1273EXPORT_SYMBOL(unregister_qdisc);
1274EXPORT_SYMBOL(tc_classify);