[NET_SCHED]: turn PSCHED_GET_TIME into inline function
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4
LT
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
1da177e4
LT
30#include <linux/init.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/kmod.h>
34#include <linux/list.h>
35#include <linux/bitops.h>
4179477f 36#include <linux/hrtimer.h>
1da177e4 37
dc5fc579 38#include <net/netlink.h>
1da177e4
LT
39#include <net/sock.h>
40#include <net/pkt_sched.h>
41
42#include <asm/processor.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
50
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (look at cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was already dequeued. It is used for
   non-standard or just buggy devices, which can defer output even
   if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers and counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime
   of the qdisc.

   ---change

   changes qdisc parameters.
 */
131
132/* Protects list of registered TC modules. It is pure SMP lock. */
133static DEFINE_RWLOCK(qdisc_mod_lock);
134
135
136/************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
139
140
141/* The list of all installed queueing disciplines. */
142
143static struct Qdisc_ops *qdisc_base;
144
145/* Register/uregister queueing discipline */
146
147int register_qdisc(struct Qdisc_ops *qops)
148{
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
151
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
156
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
163
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
170}
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
43effa1e 194static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4
LT
195{
196 struct Qdisc *q;
197
1da177e4 198 list_for_each_entry(q, &dev->qdisc_list, list) {
43effa1e 199 if (q->handle == handle)
1da177e4 200 return q;
1da177e4 201 }
1da177e4
LT
202 return NULL;
203}
204
43effa1e
PM
205struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
206{
207 struct Qdisc *q;
208
209 read_lock(&qdisc_tree_lock);
210 q = __qdisc_lookup(dev, handle);
211 read_unlock(&qdisc_tree_lock);
212 return q;
213}
214
1da177e4
LT
215static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
216{
217 unsigned long cl;
218 struct Qdisc *leaf;
219 struct Qdisc_class_ops *cops = p->ops->cl_ops;
220
221 if (cops == NULL)
222 return NULL;
223 cl = cops->get(p, classid);
224
225 if (cl == 0)
226 return NULL;
227 leaf = cops->leaf(p, cl);
228 cops->put(p, cl);
229 return leaf;
230}
231
232/* Find queueing discipline by name */
233
234static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
235{
236 struct Qdisc_ops *q = NULL;
237
238 if (kind) {
239 read_lock(&qdisc_mod_lock);
240 for (q = qdisc_base; q; q = q->next) {
241 if (rtattr_strcmp(kind, q->id) == 0) {
242 if (!try_module_get(q->owner))
243 q = NULL;
244 break;
245 }
246 }
247 read_unlock(&qdisc_mod_lock);
248 }
249 return q;
250}
251
252static struct qdisc_rate_table *qdisc_rtab_list;
253
254struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
255{
256 struct qdisc_rate_table *rtab;
257
258 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
259 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
260 rtab->refcnt++;
261 return rtab;
262 }
263 }
264
265 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
266 return NULL;
267
268 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
269 if (rtab) {
270 rtab->rate = *r;
271 rtab->refcnt = 1;
272 memcpy(rtab->data, RTA_DATA(tab), 1024);
273 rtab->next = qdisc_rtab_list;
274 qdisc_rtab_list = rtab;
275 }
276 return rtab;
277}
278
279void qdisc_put_rtab(struct qdisc_rate_table *tab)
280{
281 struct qdisc_rate_table *rtab, **rtabp;
282
283 if (!tab || --tab->refcnt)
284 return;
285
286 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
287 if (rtab == tab) {
288 *rtabp = rtab->next;
289 kfree(rtab);
290 return;
291 }
292 }
293}
294
4179477f
PM
295static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
296{
297 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
298 timer);
1936502d 299 struct net_device *dev = wd->qdisc->dev;
4179477f
PM
300
301 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
11274e5a 302 smp_wmb();
1936502d
SH
303 if (spin_trylock(&dev->queue_lock)) {
304 qdisc_run(dev);
305 spin_unlock(&dev->queue_lock);
306 } else
307 netif_schedule(dev);
308
4179477f
PM
309 return HRTIMER_NORESTART;
310}
311
312void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
313{
314 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
315 wd->timer.function = qdisc_watchdog;
316 wd->qdisc = qdisc;
317}
318EXPORT_SYMBOL(qdisc_watchdog_init);
319
320void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
321{
322 ktime_t time;
323
324 wd->qdisc->flags |= TCQ_F_THROTTLED;
11274e5a 325 smp_wmb();
4179477f
PM
326 time = ktime_set(0, 0);
327 time = ktime_add_ns(time, PSCHED_US2NS(expires));
328 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
329}
330EXPORT_SYMBOL(qdisc_watchdog_schedule);
331
332void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
333{
334 hrtimer_cancel(&wd->timer);
335 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
11274e5a 336 smp_wmb();
4179477f
PM
337}
338EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4
LT
339
340/* Allocate an unique handle from space managed by kernel */
341
342static u32 qdisc_alloc_handle(struct net_device *dev)
343{
344 int i = 0x10000;
345 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
346
347 do {
348 autohandle += TC_H_MAKE(0x10000U, 0);
349 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
350 autohandle = TC_H_MAKE(0x80000000U, 0);
351 } while (qdisc_lookup(dev, autohandle) && --i > 0);
352
353 return i>0 ? autohandle : 0;
354}
355
356/* Attach toplevel qdisc to device dev */
357
358static struct Qdisc *
359dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
360{
361 struct Qdisc *oqdisc;
362
363 if (dev->flags & IFF_UP)
364 dev_deactivate(dev);
365
366 qdisc_lock_tree(dev);
367 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
368 oqdisc = dev->qdisc_ingress;
369 /* Prune old scheduler */
370 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
371 /* delete */
372 qdisc_reset(oqdisc);
373 dev->qdisc_ingress = NULL;
374 } else { /* new */
375 dev->qdisc_ingress = qdisc;
376 }
377
378 } else {
379
380 oqdisc = dev->qdisc_sleeping;
381
382 /* Prune old scheduler */
383 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
384 qdisc_reset(oqdisc);
385
386 /* ... and graft new one */
387 if (qdisc == NULL)
388 qdisc = &noop_qdisc;
389 dev->qdisc_sleeping = qdisc;
390 dev->qdisc = &noop_qdisc;
391 }
392
393 qdisc_unlock_tree(dev);
394
395 if (dev->flags & IFF_UP)
396 dev_activate(dev);
397
398 return oqdisc;
399}
400
43effa1e
PM
401void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
402{
403 struct Qdisc_class_ops *cops;
404 unsigned long cl;
405 u32 parentid;
406
407 if (n == 0)
408 return;
409 while ((parentid = sch->parent)) {
410 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
411 cops = sch->ops->cl_ops;
412 if (cops->qlen_notify) {
413 cl = cops->get(sch, parentid);
414 cops->qlen_notify(sch, cl);
415 cops->put(sch, cl);
416 }
417 sch->q.qlen -= n;
418 }
419}
420EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4
LT
421
422/* Graft qdisc "new" to class "classid" of qdisc "parent" or
423 to device "dev".
424
425 Old qdisc is not destroyed but returned in *old.
426 */
427
428static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
429 u32 classid,
430 struct Qdisc *new, struct Qdisc **old)
431{
432 int err = 0;
433 struct Qdisc *q = *old;
434
435
10297b99 436 if (parent == NULL) {
1da177e4
LT
437 if (q && q->flags&TCQ_F_INGRESS) {
438 *old = dev_graft_qdisc(dev, q);
439 } else {
440 *old = dev_graft_qdisc(dev, new);
441 }
442 } else {
443 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
444
445 err = -EINVAL;
446
447 if (cops) {
448 unsigned long cl = cops->get(parent, classid);
449 if (cl) {
450 err = cops->graft(parent, cl, new, old);
451 if (new)
452 new->parent = classid;
453 cops->put(parent, cl);
454 }
455 }
456 }
457 return err;
458}
459
460/*
461 Allocate and initialize new qdisc.
462
463 Parameters are passed via opt.
464 */
465
466static struct Qdisc *
467qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
468{
469 int err;
470 struct rtattr *kind = tca[TCA_KIND-1];
1da177e4
LT
471 struct Qdisc *sch;
472 struct Qdisc_ops *ops;
1da177e4
LT
473
474 ops = qdisc_lookup_ops(kind);
475#ifdef CONFIG_KMOD
476 if (ops == NULL && kind != NULL) {
477 char name[IFNAMSIZ];
478 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
479 /* We dropped the RTNL semaphore in order to
480 * perform the module load. So, even if we
481 * succeeded in loading the module we have to
482 * tell the caller to replay the request. We
483 * indicate this using -EAGAIN.
484 * We replay the request because the device may
485 * go away in the mean time.
486 */
487 rtnl_unlock();
488 request_module("sch_%s", name);
489 rtnl_lock();
490 ops = qdisc_lookup_ops(kind);
491 if (ops != NULL) {
492 /* We will try again qdisc_lookup_ops,
493 * so don't keep a reference.
494 */
495 module_put(ops->owner);
496 err = -EAGAIN;
497 goto err_out;
498 }
499 }
500 }
501#endif
502
b9e2cc0f 503 err = -ENOENT;
1da177e4
LT
504 if (ops == NULL)
505 goto err_out;
506
3d54b82f
TG
507 sch = qdisc_alloc(dev, ops);
508 if (IS_ERR(sch)) {
509 err = PTR_ERR(sch);
1da177e4 510 goto err_out2;
3d54b82f 511 }
1da177e4 512
3d54b82f 513 if (handle == TC_H_INGRESS) {
1da177e4 514 sch->flags |= TCQ_F_INGRESS;
3d54b82f
TG
515 handle = TC_H_MAKE(TC_H_INGRESS, 0);
516 } else if (handle == 0) {
1da177e4
LT
517 handle = qdisc_alloc_handle(dev);
518 err = -ENOMEM;
519 if (handle == 0)
520 goto err_out3;
521 }
522
3d54b82f 523 sch->handle = handle;
1da177e4
LT
524
525 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
023e09a7
TG
526#ifdef CONFIG_NET_ESTIMATOR
527 if (tca[TCA_RATE-1]) {
528 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
529 sch->stats_lock,
530 tca[TCA_RATE-1]);
531 if (err) {
532 /*
533 * Any broken qdiscs that would require
534 * a ops->reset() here? The qdisc was never
535 * in action so it shouldn't be necessary.
536 */
537 if (ops->destroy)
538 ops->destroy(sch);
539 goto err_out3;
540 }
541 }
542#endif
1da177e4
LT
543 qdisc_lock_tree(dev);
544 list_add_tail(&sch->list, &dev->qdisc_list);
545 qdisc_unlock_tree(dev);
546
1da177e4
LT
547 return sch;
548 }
549err_out3:
550 dev_put(dev);
3d54b82f 551 kfree((char *) sch - sch->padded);
1da177e4
LT
552err_out2:
553 module_put(ops->owner);
554err_out:
555 *errp = err;
1da177e4
LT
556 return NULL;
557}
558
559static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
560{
561 if (tca[TCA_OPTIONS-1]) {
562 int err;
563
564 if (sch->ops->change == NULL)
565 return -EINVAL;
566 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
567 if (err)
568 return err;
569 }
570#ifdef CONFIG_NET_ESTIMATOR
571 if (tca[TCA_RATE-1])
572 gen_replace_estimator(&sch->bstats, &sch->rate_est,
573 sch->stats_lock, tca[TCA_RATE-1]);
574#endif
575 return 0;
576}
577
578struct check_loop_arg
579{
580 struct qdisc_walker w;
581 struct Qdisc *p;
582 int depth;
583};
584
585static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
586
587static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
588{
589 struct check_loop_arg arg;
590
591 if (q->ops->cl_ops == NULL)
592 return 0;
593
594 arg.w.stop = arg.w.skip = arg.w.count = 0;
595 arg.w.fn = check_loop_fn;
596 arg.depth = depth;
597 arg.p = p;
598 q->ops->cl_ops->walk(q, &arg.w);
599 return arg.w.stop ? -ELOOP : 0;
600}
601
602static int
603check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
604{
605 struct Qdisc *leaf;
606 struct Qdisc_class_ops *cops = q->ops->cl_ops;
607 struct check_loop_arg *arg = (struct check_loop_arg *)w;
608
609 leaf = cops->leaf(q, cl);
610 if (leaf) {
611 if (leaf == arg->p || arg->depth > 7)
612 return -ELOOP;
613 return check_loop(leaf, arg->p, arg->depth + 1);
614 }
615 return 0;
616}
617
618/*
619 * Delete/get qdisc.
620 */
621
622static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
623{
624 struct tcmsg *tcm = NLMSG_DATA(n);
625 struct rtattr **tca = arg;
626 struct net_device *dev;
627 u32 clid = tcm->tcm_parent;
628 struct Qdisc *q = NULL;
629 struct Qdisc *p = NULL;
630 int err;
631
632 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
633 return -ENODEV;
634
635 if (clid) {
636 if (clid != TC_H_ROOT) {
637 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
638 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
639 return -ENOENT;
640 q = qdisc_leaf(p, clid);
641 } else { /* ingress */
642 q = dev->qdisc_ingress;
10297b99 643 }
1da177e4
LT
644 } else {
645 q = dev->qdisc_sleeping;
646 }
647 if (!q)
648 return -ENOENT;
649
650 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
651 return -EINVAL;
652 } else {
653 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
654 return -ENOENT;
655 }
656
657 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
658 return -EINVAL;
659
660 if (n->nlmsg_type == RTM_DELQDISC) {
661 if (!clid)
662 return -EINVAL;
663 if (q->handle == 0)
664 return -ENOENT;
665 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
666 return err;
667 if (q) {
668 qdisc_notify(skb, n, clid, q, NULL);
669 spin_lock_bh(&dev->queue_lock);
670 qdisc_destroy(q);
671 spin_unlock_bh(&dev->queue_lock);
672 }
673 } else {
674 qdisc_notify(skb, n, clid, NULL, q);
675 }
676 return 0;
677}
678
679/*
680 Create/change qdisc.
681 */
682
683static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
684{
685 struct tcmsg *tcm;
686 struct rtattr **tca;
687 struct net_device *dev;
688 u32 clid;
689 struct Qdisc *q, *p;
690 int err;
691
692replay:
693 /* Reinit, just in case something touches this. */
694 tcm = NLMSG_DATA(n);
695 tca = arg;
696 clid = tcm->tcm_parent;
697 q = p = NULL;
698
699 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
700 return -ENODEV;
701
702 if (clid) {
703 if (clid != TC_H_ROOT) {
704 if (clid != TC_H_INGRESS) {
705 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
706 return -ENOENT;
707 q = qdisc_leaf(p, clid);
708 } else { /*ingress */
709 q = dev->qdisc_ingress;
710 }
711 } else {
712 q = dev->qdisc_sleeping;
713 }
714
715 /* It may be default qdisc, ignore it */
716 if (q && q->handle == 0)
717 q = NULL;
718
719 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
720 if (tcm->tcm_handle) {
721 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
722 return -EEXIST;
723 if (TC_H_MIN(tcm->tcm_handle))
724 return -EINVAL;
725 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
726 goto create_n_graft;
727 if (n->nlmsg_flags&NLM_F_EXCL)
728 return -EEXIST;
729 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
730 return -EINVAL;
731 if (q == p ||
732 (p && check_loop(q, p, 0)))
733 return -ELOOP;
734 atomic_inc(&q->refcnt);
735 goto graft;
736 } else {
737 if (q == NULL)
738 goto create_n_graft;
739
740 /* This magic test requires explanation.
741 *
742 * We know, that some child q is already
743 * attached to this parent and have choice:
744 * either to change it or to create/graft new one.
745 *
746 * 1. We are allowed to create/graft only
747 * if CREATE and REPLACE flags are set.
748 *
749 * 2. If EXCL is set, requestor wanted to say,
750 * that qdisc tcm_handle is not expected
751 * to exist, so that we choose create/graft too.
752 *
753 * 3. The last case is when no flags are set.
754 * Alas, it is sort of hole in API, we
755 * cannot decide what to do unambiguously.
756 * For now we select create/graft, if
757 * user gave KIND, which does not match existing.
758 */
759 if ((n->nlmsg_flags&NLM_F_CREATE) &&
760 (n->nlmsg_flags&NLM_F_REPLACE) &&
761 ((n->nlmsg_flags&NLM_F_EXCL) ||
762 (tca[TCA_KIND-1] &&
763 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
764 goto create_n_graft;
765 }
766 }
767 } else {
768 if (!tcm->tcm_handle)
769 return -EINVAL;
770 q = qdisc_lookup(dev, tcm->tcm_handle);
771 }
772
773 /* Change qdisc parameters */
774 if (q == NULL)
775 return -ENOENT;
776 if (n->nlmsg_flags&NLM_F_EXCL)
777 return -EEXIST;
778 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
779 return -EINVAL;
780 err = qdisc_change(q, tca);
781 if (err == 0)
782 qdisc_notify(skb, n, clid, NULL, q);
783 return err;
784
785create_n_graft:
786 if (!(n->nlmsg_flags&NLM_F_CREATE))
787 return -ENOENT;
788 if (clid == TC_H_INGRESS)
789 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
10297b99 790 else
1da177e4
LT
791 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
792 if (q == NULL) {
793 if (err == -EAGAIN)
794 goto replay;
795 return err;
796 }
797
798graft:
799 if (1) {
800 struct Qdisc *old_q = NULL;
801 err = qdisc_graft(dev, p, clid, q, &old_q);
802 if (err) {
803 if (q) {
804 spin_lock_bh(&dev->queue_lock);
805 qdisc_destroy(q);
806 spin_unlock_bh(&dev->queue_lock);
807 }
808 return err;
809 }
810 qdisc_notify(skb, n, clid, old_q, q);
811 if (old_q) {
812 spin_lock_bh(&dev->queue_lock);
813 qdisc_destroy(old_q);
814 spin_unlock_bh(&dev->queue_lock);
815 }
816 }
817 return 0;
818}
819
820static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
e431b8c0 821 u32 pid, u32 seq, u16 flags, int event)
1da177e4
LT
822{
823 struct tcmsg *tcm;
824 struct nlmsghdr *nlh;
27a884dc 825 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
826 struct gnet_dump d;
827
e431b8c0 828 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1da177e4
LT
829 tcm = NLMSG_DATA(nlh);
830 tcm->tcm_family = AF_UNSPEC;
9ef1d4c7
PM
831 tcm->tcm__pad1 = 0;
832 tcm->tcm__pad2 = 0;
1da177e4
LT
833 tcm->tcm_ifindex = q->dev->ifindex;
834 tcm->tcm_parent = clid;
835 tcm->tcm_handle = q->handle;
836 tcm->tcm_info = atomic_read(&q->refcnt);
837 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
838 if (q->ops->dump && q->ops->dump(q, skb) < 0)
839 goto rtattr_failure;
840 q->qstats.qlen = q->q.qlen;
841
842 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
843 TCA_XSTATS, q->stats_lock, &d) < 0)
844 goto rtattr_failure;
845
846 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
847 goto rtattr_failure;
848
849 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
850#ifdef CONFIG_NET_ESTIMATOR
851 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
852#endif
853 gnet_stats_copy_queue(&d, &q->qstats) < 0)
854 goto rtattr_failure;
10297b99 855
1da177e4
LT
856 if (gnet_stats_finish_copy(&d) < 0)
857 goto rtattr_failure;
10297b99 858
27a884dc 859 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1da177e4
LT
860 return skb->len;
861
862nlmsg_failure:
863rtattr_failure:
dc5fc579 864 nlmsg_trim(skb, b);
1da177e4
LT
865 return -1;
866}
867
868static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
869 u32 clid, struct Qdisc *old, struct Qdisc *new)
870{
871 struct sk_buff *skb;
872 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
873
874 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
875 if (!skb)
876 return -ENOBUFS;
877
878 if (old && old->handle) {
879 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
880 goto err_out;
881 }
882 if (new) {
883 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
884 goto err_out;
885 }
886
887 if (skb->len)
ac6d439d 888 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
889
890err_out:
891 kfree_skb(skb);
892 return -EINVAL;
893}
894
895static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
896{
897 int idx, q_idx;
898 int s_idx, s_q_idx;
899 struct net_device *dev;
900 struct Qdisc *q;
901
902 s_idx = cb->args[0];
903 s_q_idx = q_idx = cb->args[1];
904 read_lock(&dev_base_lock);
905 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
906 if (idx < s_idx)
907 continue;
908 if (idx > s_idx)
909 s_q_idx = 0;
85670cc1 910 read_lock(&qdisc_tree_lock);
1da177e4
LT
911 q_idx = 0;
912 list_for_each_entry(q, &dev->qdisc_list, list) {
913 if (q_idx < s_q_idx) {
914 q_idx++;
915 continue;
916 }
917 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
918 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
85670cc1 919 read_unlock(&qdisc_tree_lock);
1da177e4
LT
920 goto done;
921 }
922 q_idx++;
923 }
85670cc1 924 read_unlock(&qdisc_tree_lock);
1da177e4
LT
925 }
926
927done:
928 read_unlock(&dev_base_lock);
929
930 cb->args[0] = idx;
931 cb->args[1] = q_idx;
932
933 return skb->len;
934}
935
936
937
938/************************************************
939 * Traffic classes manipulation. *
940 ************************************************/
941
942
943
944static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
945{
946 struct tcmsg *tcm = NLMSG_DATA(n);
947 struct rtattr **tca = arg;
948 struct net_device *dev;
949 struct Qdisc *q = NULL;
950 struct Qdisc_class_ops *cops;
951 unsigned long cl = 0;
952 unsigned long new_cl;
953 u32 pid = tcm->tcm_parent;
954 u32 clid = tcm->tcm_handle;
955 u32 qid = TC_H_MAJ(clid);
956 int err;
957
958 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
959 return -ENODEV;
960
961 /*
962 parent == TC_H_UNSPEC - unspecified parent.
963 parent == TC_H_ROOT - class is root, which has no parent.
964 parent == X:0 - parent is root class.
965 parent == X:Y - parent is a node in hierarchy.
966 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
967
968 handle == 0:0 - generate handle from kernel pool.
969 handle == 0:Y - class is X:Y, where X:0 is qdisc.
970 handle == X:Y - clear.
971 handle == X:0 - root class.
972 */
973
974 /* Step 1. Determine qdisc handle X:0 */
975
976 if (pid != TC_H_ROOT) {
977 u32 qid1 = TC_H_MAJ(pid);
978
979 if (qid && qid1) {
980 /* If both majors are known, they must be identical. */
981 if (qid != qid1)
982 return -EINVAL;
983 } else if (qid1) {
984 qid = qid1;
985 } else if (qid == 0)
986 qid = dev->qdisc_sleeping->handle;
987
988 /* Now qid is genuine qdisc handle consistent
989 both with parent and child.
990
991 TC_H_MAJ(pid) still may be unspecified, complete it now.
992 */
993 if (pid)
994 pid = TC_H_MAKE(qid, pid);
995 } else {
996 if (qid == 0)
997 qid = dev->qdisc_sleeping->handle;
998 }
999
1000 /* OK. Locate qdisc */
10297b99 1001 if ((q = qdisc_lookup(dev, qid)) == NULL)
1da177e4
LT
1002 return -ENOENT;
1003
1004 /* An check that it supports classes */
1005 cops = q->ops->cl_ops;
1006 if (cops == NULL)
1007 return -EINVAL;
1008
1009 /* Now try to get class */
1010 if (clid == 0) {
1011 if (pid == TC_H_ROOT)
1012 clid = qid;
1013 } else
1014 clid = TC_H_MAKE(qid, clid);
1015
1016 if (clid)
1017 cl = cops->get(q, clid);
1018
1019 if (cl == 0) {
1020 err = -ENOENT;
1021 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1022 goto out;
1023 } else {
1024 switch (n->nlmsg_type) {
10297b99 1025 case RTM_NEWTCLASS:
1da177e4
LT
1026 err = -EEXIST;
1027 if (n->nlmsg_flags&NLM_F_EXCL)
1028 goto out;
1029 break;
1030 case RTM_DELTCLASS:
1031 err = cops->delete(q, cl);
1032 if (err == 0)
1033 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1034 goto out;
1035 case RTM_GETTCLASS:
1036 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1037 goto out;
1038 default:
1039 err = -EINVAL;
1040 goto out;
1041 }
1042 }
1043
1044 new_cl = cl;
1045 err = cops->change(q, clid, pid, tca, &new_cl);
1046 if (err == 0)
1047 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1048
1049out:
1050 if (cl)
1051 cops->put(q, cl);
1052
1053 return err;
1054}
1055
1056
1057static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1058 unsigned long cl,
e431b8c0 1059 u32 pid, u32 seq, u16 flags, int event)
1da177e4
LT
1060{
1061 struct tcmsg *tcm;
1062 struct nlmsghdr *nlh;
27a884dc 1063 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
1064 struct gnet_dump d;
1065 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1066
e431b8c0 1067 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1da177e4
LT
1068 tcm = NLMSG_DATA(nlh);
1069 tcm->tcm_family = AF_UNSPEC;
1070 tcm->tcm_ifindex = q->dev->ifindex;
1071 tcm->tcm_parent = q->handle;
1072 tcm->tcm_handle = q->handle;
1073 tcm->tcm_info = 0;
1074 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1075 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1076 goto rtattr_failure;
1077
1078 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1079 TCA_XSTATS, q->stats_lock, &d) < 0)
1080 goto rtattr_failure;
1081
1082 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1083 goto rtattr_failure;
1084
1085 if (gnet_stats_finish_copy(&d) < 0)
1086 goto rtattr_failure;
1087
27a884dc 1088 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1da177e4
LT
1089 return skb->len;
1090
1091nlmsg_failure:
1092rtattr_failure:
dc5fc579 1093 nlmsg_trim(skb, b);
1da177e4
LT
1094 return -1;
1095}
1096
1097static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1098 struct Qdisc *q, unsigned long cl, int event)
1099{
1100 struct sk_buff *skb;
1101 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1102
1103 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1104 if (!skb)
1105 return -ENOBUFS;
1106
1107 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1108 kfree_skb(skb);
1109 return -EINVAL;
1110 }
1111
ac6d439d 1112 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1113}
1114
1115struct qdisc_dump_args
1116{
1117 struct qdisc_walker w;
1118 struct sk_buff *skb;
1119 struct netlink_callback *cb;
1120};
1121
1122static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1123{
1124 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1125
1126 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1127 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1128}
1129
1130static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1131{
1132 int t;
1133 int s_t;
1134 struct net_device *dev;
1135 struct Qdisc *q;
1136 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1137 struct qdisc_dump_args arg;
1138
1139 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1140 return 0;
1141 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1142 return 0;
1143
1144 s_t = cb->args[0];
1145 t = 0;
1146
85670cc1 1147 read_lock(&qdisc_tree_lock);
1da177e4
LT
1148 list_for_each_entry(q, &dev->qdisc_list, list) {
1149 if (t < s_t || !q->ops->cl_ops ||
1150 (tcm->tcm_parent &&
1151 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1152 t++;
1153 continue;
1154 }
1155 if (t > s_t)
1156 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1157 arg.w.fn = qdisc_class_dump;
1158 arg.skb = skb;
1159 arg.cb = cb;
1160 arg.w.stop = 0;
1161 arg.w.skip = cb->args[1];
1162 arg.w.count = 0;
1163 q->ops->cl_ops->walk(q, &arg.w);
1164 cb->args[1] = arg.w.count;
1165 if (arg.w.stop)
1166 break;
1167 t++;
1168 }
85670cc1 1169 read_unlock(&qdisc_tree_lock);
1da177e4
LT
1170
1171 cb->args[0] = t;
1172
1173 dev_put(dev);
1174 return skb->len;
1175}
1176
1177/* Main classifier routine: scans classifier chain attached
1178 to this qdisc, (optionally) tests for protocol and asks
1179 specific classifiers.
1180 */
1181int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1182 struct tcf_result *res)
1183{
1184 int err = 0;
66c6f529 1185 __be16 protocol = skb->protocol;
1da177e4
LT
1186#ifdef CONFIG_NET_CLS_ACT
1187 struct tcf_proto *otp = tp;
1188reclassify:
1189#endif
1190 protocol = skb->protocol;
1191
1192 for ( ; tp; tp = tp->next) {
1193 if ((tp->protocol == protocol ||
b6d9bcb0 1194 tp->protocol == htons(ETH_P_ALL)) &&
1da177e4
LT
1195 (err = tp->classify(skb, tp, res)) >= 0) {
1196#ifdef CONFIG_NET_CLS_ACT
1197 if ( TC_ACT_RECLASSIFY == err) {
1198 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1199 tp = otp;
1200
1201 if (MAX_REC_LOOP < verd++) {
1202 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1203 tp->prio&0xffff, ntohs(tp->protocol));
1204 return TC_ACT_SHOT;
1205 }
1206 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1207 goto reclassify;
1208 } else {
10297b99 1209 if (skb->tc_verd)
1da177e4
LT
1210 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1211 return err;
1212 }
1213#else
1214
1215 return err;
1216#endif
1217 }
1218
1219 }
1220 return -1;
1221}
1222
1da177e4
LT
1223#ifdef CONFIG_PROC_FS
1224static int psched_show(struct seq_file *seq, void *v)
1225{
1226 seq_printf(seq, "%08x %08x %08x %08x\n",
641b9e0e 1227 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
514bca32
PM
1228 1000000,
1229 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));
1da177e4
LT
1230
1231 return 0;
1232}
1233
1234static int psched_open(struct inode *inode, struct file *file)
1235{
1236 return single_open(file, psched_show, PDE(inode)->data);
1237}
1238
da7071d7 1239static const struct file_operations psched_fops = {
1da177e4
LT
1240 .owner = THIS_MODULE,
1241 .open = psched_open,
1242 .read = seq_read,
1243 .llseek = seq_lseek,
1244 .release = single_release,
10297b99 1245};
1da177e4
LT
1246#endif
1247
1da177e4
LT
1248static int __init pktsched_init(void)
1249{
1da177e4
LT
1250 register_qdisc(&pfifo_qdisc_ops);
1251 register_qdisc(&bfifo_qdisc_ops);
1252 proc_net_fops_create("psched", 0, &psched_fops);
1253
be577ddc
TG
1254 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1255 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1256 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1257 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1258 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1259 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1260
1da177e4
LT
1261 return 0;
1262}
1263
1264subsys_initcall(pktsched_init);
1265
1266EXPORT_SYMBOL(qdisc_get_rtab);
1267EXPORT_SYMBOL(qdisc_put_rtab);
1268EXPORT_SYMBOL(register_qdisc);
1269EXPORT_SYMBOL(unregister_qdisc);
1270EXPORT_SYMBOL(tc_classify);