Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4 21#include <linux/string.h>
1da177e4 22#include <linux/errno.h>
1da177e4 23#include <linux/skbuff.h>
1da177e4
LT
24#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
4179477f 29#include <linux/hrtimer.h>
25bfcd5a 30#include <linux/lockdep.h>
5a0e3ad6 31#include <linux/slab.h>
1da177e4 32
457c4cbc 33#include <net/net_namespace.h>
b854272b 34#include <net/sock.h>
dc5fc579 35#include <net/netlink.h>
1da177e4
LT
36#include <net/pkt_sched.h>
37
7316ae88
TG
38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39 struct nlmsghdr *n, u32 clid,
1da177e4 40 struct Qdisc *old, struct Qdisc *new);
7316ae88
TG
41static int tclass_notify(struct net *net, struct sk_buff *oskb,
42 struct nlmsghdr *n, struct Qdisc *q,
43 unsigned long cl, int event);
1da177e4
LT
44
45/*
46
47 Short review.
48 -------------
49
50 This file consists of two interrelated parts:
51
52 1. queueing disciplines manager frontend.
53 2. traffic classes manager frontend.
54
55 Generally, queueing discipline ("qdisc") is a black box,
56 which is able to enqueue packets and to dequeue them (when
57 device is ready to send something) in order and at times
58 determined by algorithm hidden in it.
59
60 qdiscs are divided into two categories:
61 - "queues", which have no internal structure visible from outside.
62 - "schedulers", which split all the packets to "traffic classes",
63 using "packet classifiers" (look at cls_api.c)
64
65 In turn, classes may have child qdiscs (as a rule, queues)
66 attached to them etc. etc. etc.
67
68 The goal of the routines in this file is to translate
69 information supplied by user in the form of handles
70 to more intelligible for kernel form, to make some sanity
71 checks and part of work, which is common to all qdiscs
72 and to provide rtnetlink notifications.
73
74 All real intelligent work is done inside qdisc modules.
75
76
77
78 Every discipline has two major routines: enqueue and dequeue.
79
80 ---dequeue
81
82 dequeue usually returns a skb to send. It is allowed to return NULL,
83 but it does not mean that queue is empty, it just means that
84 discipline does not want to send anything this time.
85 Queue is really empty if q->q.qlen == 0.
86 For complicated disciplines with multiple queues q->q is not
87 real packet queue, but however q->q.qlen must be valid.
88
89 ---enqueue
90
91 enqueue returns 0 if the packet was enqueued successfully.
92 If a packet (this one or another one) was dropped, it returns
93 a non-zero error code.
94 NET_XMIT_DROP - this packet dropped
95 Expected action: do not backoff, but wait until queue will clear.
96 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
97 Expected action: backoff or ignore
98 NET_XMIT_POLICED - dropped by police.
99 Expected action: backoff or error to real-time apps.
100
101 Auxiliary routines:
102
99c0db26
JP
103 ---peek
104
105 like dequeue but without removing a packet from the queue
106
1da177e4
LT
107 ---reset
108
109 returns qdisc to initial state: purge all buffers, clear all
110 timers, counters (except for statistics) etc.
111
112 ---init
113
114 initializes newly created qdisc.
115
116 ---destroy
117
118 destroys resources allocated by init and during lifetime of qdisc.
119
120 ---change
121
122 changes qdisc parameters.
123 */
124
125/* Protects list of registered TC modules. It is pure SMP lock. */
126static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129/************************************************
130 * Queueing disciplines manipulation. *
131 ************************************************/
132
133
134/* The list of all installed queueing disciplines. */
135
136static struct Qdisc_ops *qdisc_base;
137
138/* Register/unregister queueing discipline */
139
140int register_qdisc(struct Qdisc_ops *qops)
141{
142 struct Qdisc_ops *q, **qp;
143 int rc = -EEXIST;
144
145 write_lock(&qdisc_mod_lock);
146 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147 if (!strcmp(qops->id, q->id))
148 goto out;
149
150 if (qops->enqueue == NULL)
151 qops->enqueue = noop_qdisc_ops.enqueue;
99c0db26
JP
152 if (qops->peek == NULL) {
153 if (qops->dequeue == NULL) {
154 qops->peek = noop_qdisc_ops.peek;
155 } else {
156 rc = -EINVAL;
157 goto out;
158 }
159 }
1da177e4
LT
160 if (qops->dequeue == NULL)
161 qops->dequeue = noop_qdisc_ops.dequeue;
162
163 qops->next = NULL;
164 *qp = qops;
165 rc = 0;
166out:
167 write_unlock(&qdisc_mod_lock);
168 return rc;
169}
62e3ba1b 170EXPORT_SYMBOL(register_qdisc);
1da177e4
LT
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
62e3ba1b 189EXPORT_SYMBOL(unregister_qdisc);
1da177e4
LT
190
191/* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
193 */
194
6113b748 195static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
8123b421
DM
196{
197 struct Qdisc *q;
198
199 if (!(root->flags & TCQ_F_BUILTIN) &&
200 root->handle == handle)
201 return root;
202
203 list_for_each_entry(q, &root->list, list) {
204 if (q->handle == handle)
205 return q;
206 }
207 return NULL;
208}
209
f6e0b239
JP
210static void qdisc_list_add(struct Qdisc *q)
211{
f6486d40 212 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
af356afa 213 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
f6e0b239
JP
214}
215
216void qdisc_list_del(struct Qdisc *q)
217{
f6486d40 218 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
f6e0b239 219 list_del(&q->list);
f6e0b239
JP
220}
221EXPORT_SYMBOL(qdisc_list_del);
222
ead81cc5 223struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4 224{
f6e0b239
JP
225 struct Qdisc *q;
226
af356afa
PM
227 q = qdisc_match_from_root(dev->qdisc, handle);
228 if (q)
229 goto out;
f6e0b239
JP
230
231 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
f6486d40 232out:
f6e0b239 233 return q;
1da177e4
LT
234}
235
236static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
237{
238 unsigned long cl;
239 struct Qdisc *leaf;
20fea08b 240 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
1da177e4
LT
241
242 if (cops == NULL)
243 return NULL;
244 cl = cops->get(p, classid);
245
246 if (cl == 0)
247 return NULL;
248 leaf = cops->leaf(p, cl);
249 cops->put(p, cl);
250 return leaf;
251}
252
253/* Find queueing discipline by name */
254
1e90474c 255static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
1da177e4
LT
256{
257 struct Qdisc_ops *q = NULL;
258
259 if (kind) {
260 read_lock(&qdisc_mod_lock);
261 for (q = qdisc_base; q; q = q->next) {
1e90474c 262 if (nla_strcmp(kind, q->id) == 0) {
1da177e4
LT
263 if (!try_module_get(q->owner))
264 q = NULL;
265 break;
266 }
267 }
268 read_unlock(&qdisc_mod_lock);
269 }
270 return q;
271}
272
273static struct qdisc_rate_table *qdisc_rtab_list;
274
1e90474c 275struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
1da177e4
LT
276{
277 struct qdisc_rate_table *rtab;
278
279 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
280 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
281 rtab->refcnt++;
282 return rtab;
283 }
284 }
285
5feb5e1a
PM
286 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
287 nla_len(tab) != TC_RTAB_SIZE)
1da177e4
LT
288 return NULL;
289
290 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
291 if (rtab) {
292 rtab->rate = *r;
293 rtab->refcnt = 1;
1e90474c 294 memcpy(rtab->data, nla_data(tab), 1024);
1da177e4
LT
295 rtab->next = qdisc_rtab_list;
296 qdisc_rtab_list = rtab;
297 }
298 return rtab;
299}
62e3ba1b 300EXPORT_SYMBOL(qdisc_get_rtab);
1da177e4
LT
301
302void qdisc_put_rtab(struct qdisc_rate_table *tab)
303{
304 struct qdisc_rate_table *rtab, **rtabp;
305
306 if (!tab || --tab->refcnt)
307 return;
308
309 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
310 if (rtab == tab) {
311 *rtabp = rtab->next;
312 kfree(rtab);
313 return;
314 }
315 }
316}
62e3ba1b 317EXPORT_SYMBOL(qdisc_put_rtab);
1da177e4 318
175f9c1b
JK
319static LIST_HEAD(qdisc_stab_list);
320static DEFINE_SPINLOCK(qdisc_stab_lock);
321
322static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
323 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
324 [TCA_STAB_DATA] = { .type = NLA_BINARY },
325};
326
327static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
328{
329 struct nlattr *tb[TCA_STAB_MAX + 1];
330 struct qdisc_size_table *stab;
331 struct tc_sizespec *s;
332 unsigned int tsize = 0;
333 u16 *tab = NULL;
334 int err;
335
336 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
337 if (err < 0)
338 return ERR_PTR(err);
339 if (!tb[TCA_STAB_BASE])
340 return ERR_PTR(-EINVAL);
341
342 s = nla_data(tb[TCA_STAB_BASE]);
343
344 if (s->tsize > 0) {
345 if (!tb[TCA_STAB_DATA])
346 return ERR_PTR(-EINVAL);
347 tab = nla_data(tb[TCA_STAB_DATA]);
348 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
349 }
350
351 if (!s || tsize != s->tsize || (!tab && tsize > 0))
352 return ERR_PTR(-EINVAL);
353
f3b9605d 354 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
355
356 list_for_each_entry(stab, &qdisc_stab_list, list) {
357 if (memcmp(&stab->szopts, s, sizeof(*s)))
358 continue;
359 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
360 continue;
361 stab->refcnt++;
f3b9605d 362 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
363 return stab;
364 }
365
f3b9605d 366 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
367
368 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
369 if (!stab)
370 return ERR_PTR(-ENOMEM);
371
372 stab->refcnt = 1;
373 stab->szopts = *s;
374 if (tsize > 0)
375 memcpy(stab->data, tab, tsize * sizeof(u16));
376
f3b9605d 377 spin_lock(&qdisc_stab_lock);
175f9c1b 378 list_add_tail(&stab->list, &qdisc_stab_list);
f3b9605d 379 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
380
381 return stab;
382}
383
384void qdisc_put_stab(struct qdisc_size_table *tab)
385{
386 if (!tab)
387 return;
388
f3b9605d 389 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
390
391 if (--tab->refcnt == 0) {
392 list_del(&tab->list);
393 kfree(tab);
394 }
395
f3b9605d 396 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
397}
398EXPORT_SYMBOL(qdisc_put_stab);
399
400static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
401{
402 struct nlattr *nest;
403
404 nest = nla_nest_start(skb, TCA_STAB);
3aa4614d
PM
405 if (nest == NULL)
406 goto nla_put_failure;
175f9c1b
JK
407 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
408 nla_nest_end(skb, nest);
409
410 return skb->len;
411
412nla_put_failure:
413 return -1;
414}
415
416void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
417{
418 int pkt_len, slot;
419
420 pkt_len = skb->len + stab->szopts.overhead;
421 if (unlikely(!stab->szopts.tsize))
422 goto out;
423
424 slot = pkt_len + stab->szopts.cell_align;
425 if (unlikely(slot < 0))
426 slot = 0;
427
428 slot >>= stab->szopts.cell_log;
429 if (likely(slot < stab->szopts.tsize))
430 pkt_len = stab->data[slot];
431 else
432 pkt_len = stab->data[stab->szopts.tsize - 1] *
433 (slot / stab->szopts.tsize) +
434 stab->data[slot % stab->szopts.tsize];
435
436 pkt_len <<= stab->szopts.size_log;
437out:
438 if (unlikely(pkt_len < 1))
439 pkt_len = 1;
440 qdisc_skb_cb(skb)->pkt_len = pkt_len;
441}
442EXPORT_SYMBOL(qdisc_calculate_pkt_len);
443
b00355db
JP
444void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
445{
446 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
447 printk(KERN_WARNING
448 "%s: %s qdisc %X: is non-work-conserving?\n",
449 txt, qdisc->ops->id, qdisc->handle >> 16);
450 qdisc->flags |= TCQ_F_WARN_NONWC;
451 }
452}
453EXPORT_SYMBOL(qdisc_warn_nonwc);
454
4179477f
PM
455static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
456{
457 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
2fbd3da3 458 timer);
4179477f
PM
459
460 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
8608db03 461 __netif_schedule(qdisc_root(wd->qdisc));
1936502d 462
4179477f
PM
463 return HRTIMER_NORESTART;
464}
465
466void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
467{
2fbd3da3
DM
468 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
469 wd->timer.function = qdisc_watchdog;
4179477f
PM
470 wd->qdisc = qdisc;
471}
472EXPORT_SYMBOL(qdisc_watchdog_init);
473
474void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
475{
476 ktime_t time;
477
2540e051
JP
478 if (test_bit(__QDISC_STATE_DEACTIVATED,
479 &qdisc_root_sleeping(wd->qdisc)->state))
480 return;
481
4179477f
PM
482 wd->qdisc->flags |= TCQ_F_THROTTLED;
483 time = ktime_set(0, 0);
ca44d6e6 484 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
2fbd3da3 485 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
4179477f
PM
486}
487EXPORT_SYMBOL(qdisc_watchdog_schedule);
488
489void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
490{
2fbd3da3 491 hrtimer_cancel(&wd->timer);
4179477f
PM
492 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
493}
494EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4 495
a94f779f 496static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
6fe1c7a5
PM
497{
498 unsigned int size = n * sizeof(struct hlist_head), i;
499 struct hlist_head *h;
500
501 if (size <= PAGE_SIZE)
502 h = kmalloc(size, GFP_KERNEL);
503 else
504 h = (struct hlist_head *)
505 __get_free_pages(GFP_KERNEL, get_order(size));
506
507 if (h != NULL) {
508 for (i = 0; i < n; i++)
509 INIT_HLIST_HEAD(&h[i]);
510 }
511 return h;
512}
513
514static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
515{
516 unsigned int size = n * sizeof(struct hlist_head);
517
518 if (size <= PAGE_SIZE)
519 kfree(h);
520 else
521 free_pages((unsigned long)h, get_order(size));
522}
523
524void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
525{
526 struct Qdisc_class_common *cl;
527 struct hlist_node *n, *next;
528 struct hlist_head *nhash, *ohash;
529 unsigned int nsize, nmask, osize;
530 unsigned int i, h;
531
532 /* Rehash when load factor exceeds 0.75 */
533 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
534 return;
535 nsize = clhash->hashsize * 2;
536 nmask = nsize - 1;
537 nhash = qdisc_class_hash_alloc(nsize);
538 if (nhash == NULL)
539 return;
540
541 ohash = clhash->hash;
542 osize = clhash->hashsize;
543
544 sch_tree_lock(sch);
545 for (i = 0; i < osize; i++) {
546 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
547 h = qdisc_class_hash(cl->classid, nmask);
548 hlist_add_head(&cl->hnode, &nhash[h]);
549 }
550 }
551 clhash->hash = nhash;
552 clhash->hashsize = nsize;
553 clhash->hashmask = nmask;
554 sch_tree_unlock(sch);
555
556 qdisc_class_hash_free(ohash, osize);
557}
558EXPORT_SYMBOL(qdisc_class_hash_grow);
559
560int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
561{
562 unsigned int size = 4;
563
564 clhash->hash = qdisc_class_hash_alloc(size);
565 if (clhash->hash == NULL)
566 return -ENOMEM;
567 clhash->hashsize = size;
568 clhash->hashmask = size - 1;
569 clhash->hashelems = 0;
570 return 0;
571}
572EXPORT_SYMBOL(qdisc_class_hash_init);
573
574void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
575{
576 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
577}
578EXPORT_SYMBOL(qdisc_class_hash_destroy);
579
580void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
581 struct Qdisc_class_common *cl)
582{
583 unsigned int h;
584
585 INIT_HLIST_NODE(&cl->hnode);
586 h = qdisc_class_hash(cl->classid, clhash->hashmask);
587 hlist_add_head(&cl->hnode, &clhash->hash[h]);
588 clhash->hashelems++;
589}
590EXPORT_SYMBOL(qdisc_class_hash_insert);
591
592void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
593 struct Qdisc_class_common *cl)
594{
595 hlist_del(&cl->hnode);
596 clhash->hashelems--;
597}
598EXPORT_SYMBOL(qdisc_class_hash_remove);
599
1da177e4
LT
600/* Allocate a unique handle from the space managed by the kernel */
601
602static u32 qdisc_alloc_handle(struct net_device *dev)
603{
604 int i = 0x10000;
605 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
606
607 do {
608 autohandle += TC_H_MAKE(0x10000U, 0);
609 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
610 autohandle = TC_H_MAKE(0x80000000U, 0);
611 } while (qdisc_lookup(dev, autohandle) && --i > 0);
612
613 return i>0 ? autohandle : 0;
614}
615
43effa1e
PM
616void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
617{
20fea08b 618 const struct Qdisc_class_ops *cops;
43effa1e
PM
619 unsigned long cl;
620 u32 parentid;
621
622 if (n == 0)
623 return;
624 while ((parentid = sch->parent)) {
066a3b5b
JP
625 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
626 return;
627
5ce2d488 628 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
ffc8fefa
PM
629 if (sch == NULL) {
630 WARN_ON(parentid != TC_H_ROOT);
631 return;
632 }
43effa1e
PM
633 cops = sch->ops->cl_ops;
634 if (cops->qlen_notify) {
635 cl = cops->get(sch, parentid);
636 cops->qlen_notify(sch, cl);
637 cops->put(sch, cl);
638 }
639 sch->q.qlen -= n;
640 }
641}
642EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4 643
7316ae88
TG
644static void notify_and_destroy(struct net *net, struct sk_buff *skb,
645 struct nlmsghdr *n, u32 clid,
99194cff
DM
646 struct Qdisc *old, struct Qdisc *new)
647{
648 if (new || old)
7316ae88 649 qdisc_notify(net, skb, n, clid, old, new);
1da177e4 650
4d8863a2 651 if (old)
99194cff 652 qdisc_destroy(old);
99194cff
DM
653}
654
655/* Graft qdisc "new" to class "classid" of qdisc "parent" or
656 * to device "dev".
657 *
658 * When appropriate send a netlink notification using 'skb'
659 * and "n".
660 *
661 * On success, destroy old qdisc.
1da177e4
LT
662 */
663
664static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
99194cff
DM
665 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
666 struct Qdisc *new, struct Qdisc *old)
1da177e4 667{
99194cff 668 struct Qdisc *q = old;
7316ae88 669 struct net *net = dev_net(dev);
1da177e4 670 int err = 0;
1da177e4 671
10297b99 672 if (parent == NULL) {
99194cff
DM
673 unsigned int i, num_q, ingress;
674
675 ingress = 0;
676 num_q = dev->num_tx_queues;
8d50b53d
DM
677 if ((q && q->flags & TCQ_F_INGRESS) ||
678 (new && new->flags & TCQ_F_INGRESS)) {
99194cff
DM
679 num_q = 1;
680 ingress = 1;
681 }
682
683 if (dev->flags & IFF_UP)
684 dev_deactivate(dev);
685
6ec1c69a
DM
686 if (new && new->ops->attach) {
687 new->ops->attach(new);
688 num_q = 0;
689 }
690
99194cff
DM
691 for (i = 0; i < num_q; i++) {
692 struct netdev_queue *dev_queue = &dev->rx_queue;
693
694 if (!ingress)
695 dev_queue = netdev_get_tx_queue(dev, i);
696
8d50b53d
DM
697 old = dev_graft_qdisc(dev_queue, new);
698 if (new && i > 0)
699 atomic_inc(&new->refcnt);
700
036d6a67
JP
701 if (!ingress)
702 qdisc_destroy(old);
1da177e4 703 }
99194cff 704
036d6a67 705 if (!ingress) {
7316ae88
TG
706 notify_and_destroy(net, skb, n, classid,
707 dev->qdisc, new);
036d6a67
JP
708 if (new && !new->ops->attach)
709 atomic_inc(&new->refcnt);
710 dev->qdisc = new ? : &noop_qdisc;
711 } else {
7316ae88 712 notify_and_destroy(net, skb, n, classid, old, new);
036d6a67 713 }
af356afa 714
99194cff
DM
715 if (dev->flags & IFF_UP)
716 dev_activate(dev);
1da177e4 717 } else {
20fea08b 718 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1da177e4 719
c9f1d038
PM
720 err = -EOPNOTSUPP;
721 if (cops && cops->graft) {
1da177e4
LT
722 unsigned long cl = cops->get(parent, classid);
723 if (cl) {
99194cff 724 err = cops->graft(parent, cl, new, &old);
1da177e4 725 cops->put(parent, cl);
c9f1d038
PM
726 } else
727 err = -ENOENT;
1da177e4 728 }
99194cff 729 if (!err)
7316ae88 730 notify_and_destroy(net, skb, n, classid, old, new);
1da177e4
LT
731 }
732 return err;
733}
734
25bfcd5a
JP
735/* lockdep annotation is needed for ingress; egress gets it only for name */
736static struct lock_class_key qdisc_tx_lock;
737static struct lock_class_key qdisc_rx_lock;
738
1da177e4
LT
739/*
740 Allocate and initialize new qdisc.
741
742 Parameters are passed via opt.
743 */
744
745static struct Qdisc *
bb949fbd 746qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
23bcf634
PM
747 struct Qdisc *p, u32 parent, u32 handle,
748 struct nlattr **tca, int *errp)
1da177e4
LT
749{
750 int err;
1e90474c 751 struct nlattr *kind = tca[TCA_KIND];
1da177e4
LT
752 struct Qdisc *sch;
753 struct Qdisc_ops *ops;
175f9c1b 754 struct qdisc_size_table *stab;
1da177e4
LT
755
756 ops = qdisc_lookup_ops(kind);
95a5afca 757#ifdef CONFIG_MODULES
1da177e4
LT
758 if (ops == NULL && kind != NULL) {
759 char name[IFNAMSIZ];
1e90474c 760 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1da177e4
LT
761 /* We dropped the RTNL semaphore in order to
762 * perform the module load. So, even if we
763 * succeeded in loading the module we have to
764 * tell the caller to replay the request. We
765 * indicate this using -EAGAIN.
766 * We replay the request because the device may
767 * go away in the mean time.
768 */
769 rtnl_unlock();
770 request_module("sch_%s", name);
771 rtnl_lock();
772 ops = qdisc_lookup_ops(kind);
773 if (ops != NULL) {
774 /* We will try again qdisc_lookup_ops,
775 * so don't keep a reference.
776 */
777 module_put(ops->owner);
778 err = -EAGAIN;
779 goto err_out;
780 }
781 }
782 }
783#endif
784
b9e2cc0f 785 err = -ENOENT;
1da177e4
LT
786 if (ops == NULL)
787 goto err_out;
788
5ce2d488 789 sch = qdisc_alloc(dev_queue, ops);
3d54b82f
TG
790 if (IS_ERR(sch)) {
791 err = PTR_ERR(sch);
1da177e4 792 goto err_out2;
3d54b82f 793 }
1da177e4 794
ffc8fefa
PM
795 sch->parent = parent;
796
3d54b82f 797 if (handle == TC_H_INGRESS) {
1da177e4 798 sch->flags |= TCQ_F_INGRESS;
3d54b82f 799 handle = TC_H_MAKE(TC_H_INGRESS, 0);
25bfcd5a 800 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
fd44de7c 801 } else {
fd44de7c
PM
802 if (handle == 0) {
803 handle = qdisc_alloc_handle(dev);
804 err = -ENOMEM;
805 if (handle == 0)
806 goto err_out3;
807 }
25bfcd5a 808 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1da177e4
LT
809 }
810
3d54b82f 811 sch->handle = handle;
1da177e4 812
1e90474c 813 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
175f9c1b
JK
814 if (tca[TCA_STAB]) {
815 stab = qdisc_get_stab(tca[TCA_STAB]);
816 if (IS_ERR(stab)) {
817 err = PTR_ERR(stab);
7c64b9f3 818 goto err_out4;
175f9c1b
JK
819 }
820 sch->stab = stab;
821 }
1e90474c 822 if (tca[TCA_RATE]) {
f6f9b93f
JP
823 spinlock_t *root_lock;
824
23bcf634
PM
825 err = -EOPNOTSUPP;
826 if (sch->flags & TCQ_F_MQROOT)
827 goto err_out4;
828
f6f9b93f 829 if ((sch->parent != TC_H_ROOT) &&
23bcf634
PM
830 !(sch->flags & TCQ_F_INGRESS) &&
831 (!p || !(p->flags & TCQ_F_MQROOT)))
f6f9b93f
JP
832 root_lock = qdisc_root_sleeping_lock(sch);
833 else
834 root_lock = qdisc_lock(sch);
835
023e09a7 836 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
f6f9b93f 837 root_lock, tca[TCA_RATE]);
23bcf634
PM
838 if (err)
839 goto err_out4;
023e09a7 840 }
f6e0b239
JP
841
842 qdisc_list_add(sch);
1da177e4 843
1da177e4
LT
844 return sch;
845 }
846err_out3:
847 dev_put(dev);
3d54b82f 848 kfree((char *) sch - sch->padded);
1da177e4
LT
849err_out2:
850 module_put(ops->owner);
851err_out:
852 *errp = err;
1da177e4 853 return NULL;
23bcf634
PM
854
855err_out4:
856 /*
857 * Any broken qdiscs that would require a ops->reset() here?
858 * The qdisc was never in action so it shouldn't be necessary.
859 */
7c64b9f3 860 qdisc_put_stab(sch->stab);
23bcf634
PM
861 if (ops->destroy)
862 ops->destroy(sch);
863 goto err_out3;
1da177e4
LT
864}
865
1e90474c 866static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1da177e4 867{
175f9c1b
JK
868 struct qdisc_size_table *stab = NULL;
869 int err = 0;
1da177e4 870
175f9c1b 871 if (tca[TCA_OPTIONS]) {
1da177e4
LT
872 if (sch->ops->change == NULL)
873 return -EINVAL;
1e90474c 874 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1da177e4
LT
875 if (err)
876 return err;
877 }
175f9c1b
JK
878
879 if (tca[TCA_STAB]) {
880 stab = qdisc_get_stab(tca[TCA_STAB]);
881 if (IS_ERR(stab))
882 return PTR_ERR(stab);
883 }
884
885 qdisc_put_stab(sch->stab);
886 sch->stab = stab;
887
23bcf634 888 if (tca[TCA_RATE]) {
71bcb09a
SH
889 /* NB: ignores errors from replace_estimator
890 because change can't be undone. */
23bcf634
PM
891 if (sch->flags & TCQ_F_MQROOT)
892 goto out;
1da177e4 893 gen_replace_estimator(&sch->bstats, &sch->rate_est,
71bcb09a
SH
894 qdisc_root_sleeping_lock(sch),
895 tca[TCA_RATE]);
23bcf634
PM
896 }
897out:
1da177e4
LT
898 return 0;
899}
900
901struct check_loop_arg
902{
903 struct qdisc_walker w;
904 struct Qdisc *p;
905 int depth;
906};
907
908static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
909
910static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
911{
912 struct check_loop_arg arg;
913
914 if (q->ops->cl_ops == NULL)
915 return 0;
916
917 arg.w.stop = arg.w.skip = arg.w.count = 0;
918 arg.w.fn = check_loop_fn;
919 arg.depth = depth;
920 arg.p = p;
921 q->ops->cl_ops->walk(q, &arg.w);
922 return arg.w.stop ? -ELOOP : 0;
923}
924
925static int
926check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
927{
928 struct Qdisc *leaf;
20fea08b 929 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1da177e4
LT
930 struct check_loop_arg *arg = (struct check_loop_arg *)w;
931
932 leaf = cops->leaf(q, cl);
933 if (leaf) {
934 if (leaf == arg->p || arg->depth > 7)
935 return -ELOOP;
936 return check_loop(leaf, arg->p, arg->depth + 1);
937 }
938 return 0;
939}
940
941/*
942 * Delete/get qdisc.
943 */
944
945static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
946{
3b1e0a65 947 struct net *net = sock_net(skb->sk);
1da177e4 948 struct tcmsg *tcm = NLMSG_DATA(n);
1e90474c 949 struct nlattr *tca[TCA_MAX + 1];
1da177e4
LT
950 struct net_device *dev;
951 u32 clid = tcm->tcm_parent;
952 struct Qdisc *q = NULL;
953 struct Qdisc *p = NULL;
954 int err;
955
7316ae88 956 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1da177e4
LT
957 return -ENODEV;
958
1e90474c
PM
959 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
960 if (err < 0)
961 return err;
962
1da177e4
LT
963 if (clid) {
964 if (clid != TC_H_ROOT) {
965 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
966 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
967 return -ENOENT;
968 q = qdisc_leaf(p, clid);
969 } else { /* ingress */
8123b421 970 q = dev->rx_queue.qdisc_sleeping;
10297b99 971 }
1da177e4 972 } else {
af356afa 973 q = dev->qdisc;
1da177e4
LT
974 }
975 if (!q)
976 return -ENOENT;
977
978 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
979 return -EINVAL;
980 } else {
981 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
982 return -ENOENT;
983 }
984
1e90474c 985 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1da177e4
LT
986 return -EINVAL;
987
988 if (n->nlmsg_type == RTM_DELQDISC) {
989 if (!clid)
990 return -EINVAL;
991 if (q->handle == 0)
992 return -ENOENT;
99194cff 993 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1da177e4 994 return err;
1da177e4 995 } else {
7316ae88 996 qdisc_notify(net, skb, n, clid, NULL, q);
1da177e4
LT
997 }
998 return 0;
999}
1000
1001/*
1002 Create/change qdisc.
1003 */
1004
1005static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1006{
3b1e0a65 1007 struct net *net = sock_net(skb->sk);
1da177e4 1008 struct tcmsg *tcm;
1e90474c 1009 struct nlattr *tca[TCA_MAX + 1];
1da177e4
LT
1010 struct net_device *dev;
1011 u32 clid;
1012 struct Qdisc *q, *p;
1013 int err;
1014
1015replay:
1016 /* Reinit, just in case something touches this. */
1017 tcm = NLMSG_DATA(n);
1da177e4
LT
1018 clid = tcm->tcm_parent;
1019 q = p = NULL;
1020
7316ae88 1021 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1da177e4
LT
1022 return -ENODEV;
1023
1e90474c
PM
1024 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1025 if (err < 0)
1026 return err;
1027
1da177e4
LT
1028 if (clid) {
1029 if (clid != TC_H_ROOT) {
1030 if (clid != TC_H_INGRESS) {
1031 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1032 return -ENOENT;
1033 q = qdisc_leaf(p, clid);
1034 } else { /*ingress */
8123b421 1035 q = dev->rx_queue.qdisc_sleeping;
1da177e4
LT
1036 }
1037 } else {
af356afa 1038 q = dev->qdisc;
1da177e4
LT
1039 }
1040
1041 /* It may be default qdisc, ignore it */
1042 if (q && q->handle == 0)
1043 q = NULL;
1044
1045 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1046 if (tcm->tcm_handle) {
1047 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1048 return -EEXIST;
1049 if (TC_H_MIN(tcm->tcm_handle))
1050 return -EINVAL;
1051 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1052 goto create_n_graft;
1053 if (n->nlmsg_flags&NLM_F_EXCL)
1054 return -EEXIST;
1e90474c 1055 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1da177e4
LT
1056 return -EINVAL;
1057 if (q == p ||
1058 (p && check_loop(q, p, 0)))
1059 return -ELOOP;
1060 atomic_inc(&q->refcnt);
1061 goto graft;
1062 } else {
1063 if (q == NULL)
1064 goto create_n_graft;
1065
1066 /* This magic test requires explanation.
1067 *
1068 * We know, that some child q is already
1069 * attached to this parent and have choice:
1070 * either to change it or to create/graft new one.
1071 *
1072 * 1. We are allowed to create/graft only
1073 * if CREATE and REPLACE flags are set.
1074 *
1075 * 2. If EXCL is set, requestor wanted to say,
1076 * that qdisc tcm_handle is not expected
1077 * to exist, so that we choose create/graft too.
1078 *
1079 * 3. The last case is when no flags are set.
1080 * Alas, it is sort of hole in API, we
1081 * cannot decide what to do unambiguously.
1082 * For now we select create/graft, if
1083 * user gave KIND, which does not match existing.
1084 */
1085 if ((n->nlmsg_flags&NLM_F_CREATE) &&
1086 (n->nlmsg_flags&NLM_F_REPLACE) &&
1087 ((n->nlmsg_flags&NLM_F_EXCL) ||
1e90474c
PM
1088 (tca[TCA_KIND] &&
1089 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1da177e4
LT
1090 goto create_n_graft;
1091 }
1092 }
1093 } else {
1094 if (!tcm->tcm_handle)
1095 return -EINVAL;
1096 q = qdisc_lookup(dev, tcm->tcm_handle);
1097 }
1098
1099 /* Change qdisc parameters */
1100 if (q == NULL)
1101 return -ENOENT;
1102 if (n->nlmsg_flags&NLM_F_EXCL)
1103 return -EEXIST;
1e90474c 1104 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1da177e4
LT
1105 return -EINVAL;
1106 err = qdisc_change(q, tca);
1107 if (err == 0)
7316ae88 1108 qdisc_notify(net, skb, n, clid, NULL, q);
1da177e4
LT
1109 return err;
1110
1111create_n_graft:
1112 if (!(n->nlmsg_flags&NLM_F_CREATE))
1113 return -ENOENT;
1114 if (clid == TC_H_INGRESS)
23bcf634 1115 q = qdisc_create(dev, &dev->rx_queue, p,
bb949fbd 1116 tcm->tcm_parent, tcm->tcm_parent,
ffc8fefa 1117 tca, &err);
6ec1c69a 1118 else {
926e61b7 1119 struct netdev_queue *dev_queue;
6ec1c69a
DM
1120
1121 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
926e61b7
JP
1122 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1123 else if (p)
1124 dev_queue = p->dev_queue;
1125 else
1126 dev_queue = netdev_get_tx_queue(dev, 0);
6ec1c69a 1127
926e61b7 1128 q = qdisc_create(dev, dev_queue, p,
bb949fbd 1129 tcm->tcm_parent, tcm->tcm_handle,
ffc8fefa 1130 tca, &err);
6ec1c69a 1131 }
1da177e4
LT
1132 if (q == NULL) {
1133 if (err == -EAGAIN)
1134 goto replay;
1135 return err;
1136 }
1137
1138graft:
e5befbd9
IJ
1139 err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1140 if (err) {
1141 if (q)
1142 qdisc_destroy(q);
1143 return err;
1da177e4 1144 }
e5befbd9 1145
1da177e4
LT
1146 return 0;
1147}
1148
1149static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
e431b8c0 1150 u32 pid, u32 seq, u16 flags, int event)
1da177e4
LT
1151{
1152 struct tcmsg *tcm;
1153 struct nlmsghdr *nlh;
27a884dc 1154 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
1155 struct gnet_dump d;
1156
e431b8c0 1157 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1da177e4
LT
1158 tcm = NLMSG_DATA(nlh);
1159 tcm->tcm_family = AF_UNSPEC;
9ef1d4c7
PM
1160 tcm->tcm__pad1 = 0;
1161 tcm->tcm__pad2 = 0;
5ce2d488 1162 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1da177e4
LT
1163 tcm->tcm_parent = clid;
1164 tcm->tcm_handle = q->handle;
1165 tcm->tcm_info = atomic_read(&q->refcnt);
57e1c487 1166 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1da177e4 1167 if (q->ops->dump && q->ops->dump(q, skb) < 0)
1e90474c 1168 goto nla_put_failure;
1da177e4
LT
1169 q->qstats.qlen = q->q.qlen;
1170
175f9c1b
JK
1171 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1172 goto nla_put_failure;
1173
102396ae
JP
1174 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1175 qdisc_root_sleeping_lock(q), &d) < 0)
1e90474c 1176 goto nla_put_failure;
1da177e4
LT
1177
1178 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1e90474c 1179 goto nla_put_failure;
1da177e4
LT
1180
1181 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
d250a5f9 1182 gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1da177e4 1183 gnet_stats_copy_queue(&d, &q->qstats) < 0)
1e90474c 1184 goto nla_put_failure;
10297b99 1185
1da177e4 1186 if (gnet_stats_finish_copy(&d) < 0)
1e90474c 1187 goto nla_put_failure;
10297b99 1188
27a884dc 1189 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1da177e4
LT
1190 return skb->len;
1191
1192nlmsg_failure:
1e90474c 1193nla_put_failure:
dc5fc579 1194 nlmsg_trim(skb, b);
1da177e4
LT
1195 return -1;
1196}
1197
53b0f080
ED
1198static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1199{
1200 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1201}
1202
7316ae88
TG
1203static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1204 struct nlmsghdr *n, u32 clid,
1205 struct Qdisc *old, struct Qdisc *new)
1da177e4
LT
1206{
1207 struct sk_buff *skb;
1208 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1209
1210 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1211 if (!skb)
1212 return -ENOBUFS;
1213
53b0f080 1214 if (old && !tc_qdisc_dump_ignore(old)) {
1da177e4
LT
1215 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1216 goto err_out;
1217 }
53b0f080 1218 if (new && !tc_qdisc_dump_ignore(new)) {
1da177e4
LT
1219 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1220 goto err_out;
1221 }
1222
1223 if (skb->len)
7316ae88 1224 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1225
1226err_out:
1227 kfree_skb(skb);
1228 return -EINVAL;
1229}
1230
30723673
DM
1231static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1232 struct netlink_callback *cb,
1233 int *q_idx_p, int s_q_idx)
1234{
1235 int ret = 0, q_idx = *q_idx_p;
1236 struct Qdisc *q;
1237
1238 if (!root)
1239 return 0;
1240
1241 q = root;
1242 if (q_idx < s_q_idx) {
1243 q_idx++;
1244 } else {
1245 if (!tc_qdisc_dump_ignore(q) &&
1246 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1247 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1248 goto done;
1249 q_idx++;
1250 }
1251 list_for_each_entry(q, &root->list, list) {
1252 if (q_idx < s_q_idx) {
1253 q_idx++;
1254 continue;
1255 }
1256 if (!tc_qdisc_dump_ignore(q) &&
1257 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1258 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1259 goto done;
1260 q_idx++;
1261 }
1262
1263out:
1264 *q_idx_p = q_idx;
1265 return ret;
1266done:
1267 ret = -1;
1268 goto out;
1269}
1270
1da177e4
LT
1271static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1272{
3b1e0a65 1273 struct net *net = sock_net(skb->sk);
1da177e4
LT
1274 int idx, q_idx;
1275 int s_idx, s_q_idx;
1276 struct net_device *dev;
1da177e4
LT
1277
1278 s_idx = cb->args[0];
1279 s_q_idx = q_idx = cb->args[1];
f1e9016d 1280
1281 rcu_read_lock();
7562f876 1282 idx = 0;
7316ae88 1283 for_each_netdev_rcu(net, dev) {
30723673
DM
1284 struct netdev_queue *dev_queue;
1285
1da177e4 1286 if (idx < s_idx)
7562f876 1287 goto cont;
1da177e4
LT
1288 if (idx > s_idx)
1289 s_q_idx = 0;
1da177e4 1290 q_idx = 0;
30723673 1291
af356afa 1292 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
30723673
DM
1293 goto done;
1294
1295 dev_queue = &dev->rx_queue;
827ebd64 1296 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
30723673
DM
1297 goto done;
1298
7562f876
PE
1299cont:
1300 idx++;
1da177e4
LT
1301 }
1302
1303done:
f1e9016d 1304 rcu_read_unlock();
1da177e4
LT
1305
1306 cb->args[0] = idx;
1307 cb->args[1] = q_idx;
1308
1309 return skb->len;
1310}
1311
1312
1313
1314/************************************************
1315 * Traffic classes manipulation. *
1316 ************************************************/
1317
1318
1319
1320static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1321{
3b1e0a65 1322 struct net *net = sock_net(skb->sk);
1da177e4 1323 struct tcmsg *tcm = NLMSG_DATA(n);
1e90474c 1324 struct nlattr *tca[TCA_MAX + 1];
1da177e4
LT
1325 struct net_device *dev;
1326 struct Qdisc *q = NULL;
20fea08b 1327 const struct Qdisc_class_ops *cops;
1da177e4
LT
1328 unsigned long cl = 0;
1329 unsigned long new_cl;
1330 u32 pid = tcm->tcm_parent;
1331 u32 clid = tcm->tcm_handle;
1332 u32 qid = TC_H_MAJ(clid);
1333 int err;
1334
7316ae88 1335 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1da177e4
LT
1336 return -ENODEV;
1337
1e90474c
PM
1338 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1339 if (err < 0)
1340 return err;
1341
1da177e4
LT
1342 /*
1343 parent == TC_H_UNSPEC - unspecified parent.
1344 parent == TC_H_ROOT - class is root, which has no parent.
1345 parent == X:0 - parent is root class.
1346 parent == X:Y - parent is a node in hierarchy.
1347 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1348
1349 handle == 0:0 - generate handle from kernel pool.
1350 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1351 handle == X:Y - clear.
1352 handle == X:0 - root class.
1353 */
1354
1355 /* Step 1. Determine qdisc handle X:0 */
1356
1357 if (pid != TC_H_ROOT) {
1358 u32 qid1 = TC_H_MAJ(pid);
1359
1360 if (qid && qid1) {
1361 /* If both majors are known, they must be identical. */
1362 if (qid != qid1)
1363 return -EINVAL;
1364 } else if (qid1) {
1365 qid = qid1;
1366 } else if (qid == 0)
af356afa 1367 qid = dev->qdisc->handle;
1da177e4
LT
1368
1369 /* Now qid is genuine qdisc handle consistent
1370 both with parent and child.
1371
1372 TC_H_MAJ(pid) still may be unspecified, complete it now.
1373 */
1374 if (pid)
1375 pid = TC_H_MAKE(qid, pid);
1376 } else {
1377 if (qid == 0)
af356afa 1378 qid = dev->qdisc->handle;
1da177e4
LT
1379 }
1380
1381 /* OK. Locate qdisc */
10297b99 1382 if ((q = qdisc_lookup(dev, qid)) == NULL)
1da177e4
LT
1383 return -ENOENT;
1384
1385 /* An check that it supports classes */
1386 cops = q->ops->cl_ops;
1387 if (cops == NULL)
1388 return -EINVAL;
1389
1390 /* Now try to get class */
1391 if (clid == 0) {
1392 if (pid == TC_H_ROOT)
1393 clid = qid;
1394 } else
1395 clid = TC_H_MAKE(qid, clid);
1396
1397 if (clid)
1398 cl = cops->get(q, clid);
1399
1400 if (cl == 0) {
1401 err = -ENOENT;
1402 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1403 goto out;
1404 } else {
1405 switch (n->nlmsg_type) {
10297b99 1406 case RTM_NEWTCLASS:
1da177e4
LT
1407 err = -EEXIST;
1408 if (n->nlmsg_flags&NLM_F_EXCL)
1409 goto out;
1410 break;
1411 case RTM_DELTCLASS:
de6d5cdf
PM
1412 err = -EOPNOTSUPP;
1413 if (cops->delete)
1414 err = cops->delete(q, cl);
1da177e4 1415 if (err == 0)
7316ae88 1416 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1da177e4
LT
1417 goto out;
1418 case RTM_GETTCLASS:
7316ae88 1419 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1da177e4
LT
1420 goto out;
1421 default:
1422 err = -EINVAL;
1423 goto out;
1424 }
1425 }
1426
1427 new_cl = cl;
de6d5cdf
PM
1428 err = -EOPNOTSUPP;
1429 if (cops->change)
1430 err = cops->change(q, clid, pid, tca, &new_cl);
1da177e4 1431 if (err == 0)
7316ae88 1432 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1da177e4
LT
1433
1434out:
1435 if (cl)
1436 cops->put(q, cl);
1437
1438 return err;
1439}
1440
1441
1442static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1443 unsigned long cl,
e431b8c0 1444 u32 pid, u32 seq, u16 flags, int event)
1da177e4
LT
1445{
1446 struct tcmsg *tcm;
1447 struct nlmsghdr *nlh;
27a884dc 1448 unsigned char *b = skb_tail_pointer(skb);
1da177e4 1449 struct gnet_dump d;
20fea08b 1450 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1da177e4 1451
e431b8c0 1452 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1da177e4
LT
1453 tcm = NLMSG_DATA(nlh);
1454 tcm->tcm_family = AF_UNSPEC;
16ebb5e0
ED
1455 tcm->tcm__pad1 = 0;
1456 tcm->tcm__pad2 = 0;
5ce2d488 1457 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1da177e4
LT
1458 tcm->tcm_parent = q->handle;
1459 tcm->tcm_handle = q->handle;
1460 tcm->tcm_info = 0;
57e1c487 1461 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1da177e4 1462 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1e90474c 1463 goto nla_put_failure;
1da177e4 1464
102396ae
JP
1465 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1466 qdisc_root_sleeping_lock(q), &d) < 0)
1e90474c 1467 goto nla_put_failure;
1da177e4
LT
1468
1469 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1e90474c 1470 goto nla_put_failure;
1da177e4
LT
1471
1472 if (gnet_stats_finish_copy(&d) < 0)
1e90474c 1473 goto nla_put_failure;
1da177e4 1474
27a884dc 1475 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1da177e4
LT
1476 return skb->len;
1477
1478nlmsg_failure:
1e90474c 1479nla_put_failure:
dc5fc579 1480 nlmsg_trim(skb, b);
1da177e4
LT
1481 return -1;
1482}
1483
7316ae88
TG
1484static int tclass_notify(struct net *net, struct sk_buff *oskb,
1485 struct nlmsghdr *n, struct Qdisc *q,
1486 unsigned long cl, int event)
1da177e4
LT
1487{
1488 struct sk_buff *skb;
1489 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1490
1491 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1492 if (!skb)
1493 return -ENOBUFS;
1494
1495 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1496 kfree_skb(skb);
1497 return -EINVAL;
1498 }
1499
7316ae88 1500 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1501}
1502
1503struct qdisc_dump_args
1504{
1505 struct qdisc_walker w;
1506 struct sk_buff *skb;
1507 struct netlink_callback *cb;
1508};
1509
1510static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1511{
1512 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1513
1514 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1515 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1516}
1517
30723673
DM
1518static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1519 struct tcmsg *tcm, struct netlink_callback *cb,
1520 int *t_p, int s_t)
1521{
1522 struct qdisc_dump_args arg;
1523
1524 if (tc_qdisc_dump_ignore(q) ||
1525 *t_p < s_t || !q->ops->cl_ops ||
1526 (tcm->tcm_parent &&
1527 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1528 (*t_p)++;
1529 return 0;
1530 }
1531 if (*t_p > s_t)
1532 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1533 arg.w.fn = qdisc_class_dump;
1534 arg.skb = skb;
1535 arg.cb = cb;
1536 arg.w.stop = 0;
1537 arg.w.skip = cb->args[1];
1538 arg.w.count = 0;
1539 q->ops->cl_ops->walk(q, &arg.w);
1540 cb->args[1] = arg.w.count;
1541 if (arg.w.stop)
1542 return -1;
1543 (*t_p)++;
1544 return 0;
1545}
1546
1547static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1548 struct tcmsg *tcm, struct netlink_callback *cb,
1549 int *t_p, int s_t)
1550{
1551 struct Qdisc *q;
1552
1553 if (!root)
1554 return 0;
1555
1556 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1557 return -1;
1558
1559 list_for_each_entry(q, &root->list, list) {
1560 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1561 return -1;
1562 }
1563
1564 return 0;
1565}
1566
1da177e4
LT
1567static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1568{
30723673 1569 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
3b1e0a65 1570 struct net *net = sock_net(skb->sk);
30723673 1571 struct netdev_queue *dev_queue;
1da177e4 1572 struct net_device *dev;
30723673 1573 int t, s_t;
1da177e4
LT
1574
1575 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1576 return 0;
7316ae88 1577 if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1da177e4
LT
1578 return 0;
1579
1580 s_t = cb->args[0];
1581 t = 0;
1582
af356afa 1583 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
30723673
DM
1584 goto done;
1585
1586 dev_queue = &dev->rx_queue;
8123b421 1587 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
30723673 1588 goto done;
1da177e4 1589
30723673 1590done:
1da177e4
LT
1591 cb->args[0] = t;
1592
1593 dev_put(dev);
1594 return skb->len;
1595}
1596
1597/* Main classifier routine: scans classifier chain attached
1598 to this qdisc, (optionally) tests for protocol and asks
1599 specific classifiers.
1600 */
73ca4918
PM
1601int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1602 struct tcf_result *res)
1603{
1604 __be16 protocol = skb->protocol;
1605 int err = 0;
1606
1607 for (; tp; tp = tp->next) {
1608 if ((tp->protocol == protocol ||
1609 tp->protocol == htons(ETH_P_ALL)) &&
1610 (err = tp->classify(skb, tp, res)) >= 0) {
1611#ifdef CONFIG_NET_CLS_ACT
1612 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1613 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1614#endif
1615 return err;
1616 }
1617 }
1618 return -1;
1619}
1620EXPORT_SYMBOL(tc_classify_compat);
1621
/*
 * Classify @skb against the chain starting at @tp.
 *
 * This is tc_classify_compat() plus, when CONFIG_NET_CLS_ACT is enabled,
 * handling of TC_ACT_RECLASSIFY: classification restarts from the head of
 * the chain, with the restart count kept in skb->tc_verd.  Once the count
 * read from tc_verd has reached MAX_REC_LOOP, a rate-limited notice is
 * logged and TC_ACT_SHOT is returned.
 *
 * Returns the classification result from tc_classify_compat(), or
 * TC_ACT_SHOT on a reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	/* Head of the chain, so a reclassify can start over from it. */
	struct tcf_proto *otp = tp;
reclassify:
#endif
	/*
	 * No local copy of skb->protocol is taken here: it was only ever
	 * written, never read; tc_classify_compat() reads the protocol
	 * itself.
	 */
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);

		tp = otp;

		/* Compare the old count, then bump it for the next round. */
		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "%s: packet reclassify loop"
				       " rule prio %u protocol %02x\n",
				       tp->q->ops->id,
				       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1da177e4 1655
a48b5a61
PM
1656void tcf_destroy(struct tcf_proto *tp)
1657{
1658 tp->ops->destroy(tp);
1659 module_put(tp->ops->owner);
1660 kfree(tp);
1661}
1662
ff31ab56 1663void tcf_destroy_chain(struct tcf_proto **fl)
a48b5a61
PM
1664{
1665 struct tcf_proto *tp;
1666
ff31ab56
PM
1667 while ((tp = *fl) != NULL) {
1668 *fl = tp->next;
a48b5a61
PM
1669 tcf_destroy(tp);
1670 }
1671}
1672EXPORT_SYMBOL(tcf_destroy_chain);
1673
1da177e4
LT
1674#ifdef CONFIG_PROC_FS
1675static int psched_show(struct seq_file *seq, void *v)
1676{
3c0cfc13
PM
1677 struct timespec ts;
1678
1679 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1da177e4 1680 seq_printf(seq, "%08x %08x %08x %08x\n",
ca44d6e6 1681 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
514bca32 1682 1000000,
3c0cfc13 1683 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1da177e4
LT
1684
1685 return 0;
1686}
1687
1688static int psched_open(struct inode *inode, struct file *file)
1689{
7e5ab157 1690 return single_open(file, psched_show, NULL);
1da177e4
LT
1691}
1692
da7071d7 1693static const struct file_operations psched_fops = {
1da177e4
LT
1694 .owner = THIS_MODULE,
1695 .open = psched_open,
1696 .read = seq_read,
1697 .llseek = seq_lseek,
1698 .release = single_release,
10297b99 1699};
7316ae88
TG
1700
1701static int __net_init psched_net_init(struct net *net)
1702{
1703 struct proc_dir_entry *e;
1704
1705 e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1706 if (e == NULL)
1707 return -ENOMEM;
1708
1709 return 0;
1710}
1711
1712static void __net_exit psched_net_exit(struct net *net)
1713{
1714 proc_net_remove(net, "psched");
7316ae88
TG
1715}
1716#else
1717static int __net_init psched_net_init(struct net *net)
1718{
1719 return 0;
1720}
1721
1722static void __net_exit psched_net_exit(struct net *net)
1723{
1724}
1da177e4
LT
1725#endif
1726
7316ae88
TG
1727static struct pernet_operations psched_net_ops = {
1728 .init = psched_net_init,
1729 .exit = psched_net_exit,
1730};
1731
1da177e4
LT
1732static int __init pktsched_init(void)
1733{
7316ae88
TG
1734 int err;
1735
1736 err = register_pernet_subsys(&psched_net_ops);
1737 if (err) {
1738 printk(KERN_ERR "pktsched_init: "
1739 "cannot initialize per netns operations\n");
1740 return err;
1741 }
1742
1da177e4
LT
1743 register_qdisc(&pfifo_qdisc_ops);
1744 register_qdisc(&bfifo_qdisc_ops);
57dbb2d8 1745 register_qdisc(&pfifo_head_drop_qdisc_ops);
6ec1c69a 1746 register_qdisc(&mq_qdisc_ops);
1da177e4 1747
be577ddc
TG
1748 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1749 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1750 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1751 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1752 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1753 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1754
1da177e4
LT
1755 return 0;
1756}
1757
1758subsys_initcall(pktsched_init);