/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets;
	- max mtu to 2^18-1;
	- max 65408 flows;
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
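
/* Worked example: a maximal 64K packet gives
 * SFQ_ALLOT_SIZE(65535) = DIV_ROUND_UP(65535, 8) = 8192,
 * which fits comfortably in the 16-bit signed 'allot' field below.
 */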

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	short		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum;	/* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
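
/* For example, sfq_dep_head(q, 5) yields &q->slots[5].dep (a flow slot),
 * while sfq_dep_head(q, SFQ_MAX_FLOWS + 3) yields &q->dep[3], the head of
 * the list of flows currently holding exactly 3 packets.
 */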

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}
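
/* Note: the bitwise AND above is a cheap modulo that relies on q->divisor
 * being a power of two, which sfq_change() enforces with is_power_of_2().
 */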

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}
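
/* Illustration: linking slot x with qlen == 2 splices it in right after
 * the dep[2] list head:
 *
 *	before:	dep[2] <-> n
 *	after:	dep[2] <-> slots[x] <-> n
 *
 * where n may be dep[2] itself when the list was empty, since the heads
 * are initialized self-linked in sfq_init().
 */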

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
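
/* The (struct sk_buff *) casts above are safe because a slot acts as its
 * own list sentinel: a pointer to the slot is only ever dereferenced for
 * its first two fields, and skblist_next/skblist_prev sit at the start of
 * struct sfq_slot, mirroring the positions of skb->next/skb->prev.
 */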

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
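
/* Effect of the flag combinations checked above:
 *
 *	q->flags			prob zone	hard zone
 *	0				drop		drop
 *	TC_RED_ECN			ECN mark	ECN mark
 *	TC_RED_ECN | TC_RED_HARDDROP	ECN mark	drop
 */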

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
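
/* Worked example of the deficit round robin above: with the default
 * quantum of one link MTU (1514 bytes on Ethernet), scaled_quantum is
 * SFQ_ALLOT_SIZE(1514) = 190. Dequeuing a 1514-byte packet costs 190
 * allot units, so a slot is skipped and replenished once its allot
 * drops to zero or below, giving each flow roughly one quantum of
 * bytes per round.
 */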

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}
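
/* A typical way to arm this timer from user space (illustrative tc
 * invocation; option names per iproute2):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * which re-seeds the hash and rehashes all queued packets every
 * 10 seconds.
 */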

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
			       (unsigned long)sch);

	err = tcf_block_get(&q->block, &q->filter_list);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");