2 * IPVS: Locality-Based Least-Connection scheduling module
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
12 * Martin Hamilton : fixed the terrible locking bugs
13 * *lock(tbl->lock) ==> *lock(&tbl->lock)
14 * Wensong Zhang : fixed the uninitialized tbl->lock bug
15 * Wensong Zhang : added doing full expiration check to
16 * collect stale entries of 24+ hours when
17 * no partial expire check in a half hour
18 * Julian Anastasov : replaced del_timer call with del_timer_sync
19 * to avoid the possible race between timer
20 * handler and del_timer thread in SMP
25 * The lblc algorithm is as follows (pseudo code):
27 * if cachenode[dest_ip] is null then
28 * n, cachenode[dest_ip] <- {weighted least-conn node};
30 * n <- cachenode[dest_ip];
32 * (n.conns>n.weight AND
33 * there is a node m with m.conns<m.weight/2) then
34 * n, cachenode[dest_ip] <- {weighted least-conn node};
38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
39 * me to write this module.
42 #define KMSG_COMPONENT "IPVS"
43 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
46 #include <linux/slab.h>
47 #include <linux/module.h>
48 #include <linux/kernel.h>
49 #include <linux/skbuff.h>
50 #include <linux/jiffies.h>
54 #include <linux/sysctl.h>
56 #include <net/ip_vs.h>
60 * It is for garbage collection of stale IPVS lblc entries,
61 * when the table is full.
63 #define CHECK_EXPIRE_INTERVAL (60*HZ)
64 #define ENTRY_TIMEOUT (6*60*HZ)
66 #define DEFAULT_EXPIRATION (24*60*60*HZ)
69 * It is for full expiration check.
70 * When there is no partial expiration check (garbage collection)
71 * in a half hour, do a full expiration check to collect stale
72 * entries that haven't been touched for a day.
74 #define COUNT_FOR_FULL_EXPIRATION 30
78 * for IPVS lblc entry hash table
/*
 * Hash table sizing: 2^IP_VS_LBLC_TAB_BITS buckets.  The Kconfig value
 * may be absent when this file is built standalone, so default to 10.
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
89 * IPVS lblc entry represents an association between destination
90 * IP address and its destination server
92 struct ip_vs_lblc_entry
{
93 struct list_head list
;
94 int af
; /* address family */
95 union nf_inet_addr addr
; /* destination IP address */
96 struct ip_vs_dest
*dest
; /* real server (cache) */
97 unsigned long lastuse
; /* last used time */
102 * IPVS lblc hash table
104 struct ip_vs_lblc_table
{
105 struct list_head bucket
[IP_VS_LBLC_TAB_SIZE
]; /* hash bucket */
106 atomic_t entries
; /* number of entries */
107 int max_size
; /* maximum size of entries */
108 struct timer_list periodic_timer
; /* collect stale entries */
109 int rover
; /* rover for expire check */
110 int counter
; /* counter for no expire */
115 * IPVS LBLC sysctl table
118 static ctl_table vs_vars_table
[] = {
120 .procname
= "lblc_expiration",
122 .maxlen
= sizeof(int),
124 .proc_handler
= proc_dointvec_jiffies
,
130 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry
*en
)
134 * We don't kfree dest because it is referred either by its service
135 * or the trash dest list.
137 atomic_dec(&en
->dest
->refcnt
);
143 * Returns hash value for IPVS LBLC entry
145 static inline unsigned
146 ip_vs_lblc_hashkey(int af
, const union nf_inet_addr
*addr
)
148 __be32 addr_fold
= addr
->ip
;
150 #ifdef CONFIG_IP_VS_IPV6
152 addr_fold
= addr
->ip6
[0]^addr
->ip6
[1]^
153 addr
->ip6
[2]^addr
->ip6
[3];
155 return (ntohl(addr_fold
)*2654435761UL) & IP_VS_LBLC_TAB_MASK
;
160 * Hash an entry in the ip_vs_lblc_table.
161 * returns bool success.
164 ip_vs_lblc_hash(struct ip_vs_lblc_table
*tbl
, struct ip_vs_lblc_entry
*en
)
166 unsigned hash
= ip_vs_lblc_hashkey(en
->af
, &en
->addr
);
168 list_add(&en
->list
, &tbl
->bucket
[hash
]);
169 atomic_inc(&tbl
->entries
);
174 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
177 static inline struct ip_vs_lblc_entry
*
178 ip_vs_lblc_get(int af
, struct ip_vs_lblc_table
*tbl
,
179 const union nf_inet_addr
*addr
)
181 unsigned hash
= ip_vs_lblc_hashkey(af
, addr
);
182 struct ip_vs_lblc_entry
*en
;
184 list_for_each_entry(en
, &tbl
->bucket
[hash
], list
)
185 if (ip_vs_addr_equal(af
, &en
->addr
, addr
))
193 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
194 * address to a server. Called under write lock.
196 static inline struct ip_vs_lblc_entry
*
197 ip_vs_lblc_new(struct ip_vs_lblc_table
*tbl
, const union nf_inet_addr
*daddr
,
198 struct ip_vs_dest
*dest
)
200 struct ip_vs_lblc_entry
*en
;
202 en
= ip_vs_lblc_get(dest
->af
, tbl
, daddr
);
204 en
= kmalloc(sizeof(*en
), GFP_ATOMIC
);
209 ip_vs_addr_copy(dest
->af
, &en
->addr
, daddr
);
210 en
->lastuse
= jiffies
;
212 atomic_inc(&dest
->refcnt
);
215 ip_vs_lblc_hash(tbl
, en
);
216 } else if (en
->dest
!= dest
) {
217 atomic_dec(&en
->dest
->refcnt
);
218 atomic_inc(&dest
->refcnt
);
227 * Flush all the entries of the specified table.
229 static void ip_vs_lblc_flush(struct ip_vs_lblc_table
*tbl
)
231 struct ip_vs_lblc_entry
*en
, *nxt
;
234 for (i
=0; i
<IP_VS_LBLC_TAB_SIZE
; i
++) {
235 list_for_each_entry_safe(en
, nxt
, &tbl
->bucket
[i
], list
) {
237 atomic_dec(&tbl
->entries
);
242 static int sysctl_lblc_expiration(struct ip_vs_service
*svc
)
245 struct netns_ipvs
*ipvs
= net_ipvs(svc
->net
);
246 return ipvs
->sysctl_lblc_expiration
;
248 return DEFAULT_EXPIRATION
;
252 static inline void ip_vs_lblc_full_check(struct ip_vs_service
*svc
)
254 struct ip_vs_lblc_table
*tbl
= svc
->sched_data
;
255 struct ip_vs_lblc_entry
*en
, *nxt
;
256 unsigned long now
= jiffies
;
259 for (i
=0, j
=tbl
->rover
; i
<IP_VS_LBLC_TAB_SIZE
; i
++) {
260 j
= (j
+ 1) & IP_VS_LBLC_TAB_MASK
;
262 write_lock(&svc
->sched_lock
);
263 list_for_each_entry_safe(en
, nxt
, &tbl
->bucket
[j
], list
) {
266 sysctl_lblc_expiration(svc
)))
270 atomic_dec(&tbl
->entries
);
272 write_unlock(&svc
->sched_lock
);
279 * Periodical timer handler for IPVS lblc table
280 * It is used to collect stale entries when the number of entries
281 * exceeds the maximum size of the table.
283 * Fixme: we probably need more complicated algorithm to collect
284 * entries that have not been used for a long time even
285 * if the number of entries doesn't exceed the maximum size
287 * The full expiration check is for this purpose now.
289 static void ip_vs_lblc_check_expire(unsigned long data
)
291 struct ip_vs_service
*svc
= (struct ip_vs_service
*) data
;
292 struct ip_vs_lblc_table
*tbl
= svc
->sched_data
;
293 unsigned long now
= jiffies
;
296 struct ip_vs_lblc_entry
*en
, *nxt
;
298 if ((tbl
->counter
% COUNT_FOR_FULL_EXPIRATION
) == 0) {
299 /* do full expiration check */
300 ip_vs_lblc_full_check(svc
);
305 if (atomic_read(&tbl
->entries
) <= tbl
->max_size
) {
310 goal
= (atomic_read(&tbl
->entries
) - tbl
->max_size
)*4/3;
311 if (goal
> tbl
->max_size
/2)
312 goal
= tbl
->max_size
/2;
314 for (i
=0, j
=tbl
->rover
; i
<IP_VS_LBLC_TAB_SIZE
; i
++) {
315 j
= (j
+ 1) & IP_VS_LBLC_TAB_MASK
;
317 write_lock(&svc
->sched_lock
);
318 list_for_each_entry_safe(en
, nxt
, &tbl
->bucket
[j
], list
) {
319 if (time_before(now
, en
->lastuse
+ ENTRY_TIMEOUT
))
323 atomic_dec(&tbl
->entries
);
326 write_unlock(&svc
->sched_lock
);
333 mod_timer(&tbl
->periodic_timer
, jiffies
+CHECK_EXPIRE_INTERVAL
);
337 static int ip_vs_lblc_init_svc(struct ip_vs_service
*svc
)
340 struct ip_vs_lblc_table
*tbl
;
343 * Allocate the ip_vs_lblc_table for this service
345 tbl
= kmalloc(sizeof(*tbl
), GFP_ATOMIC
);
349 svc
->sched_data
= tbl
;
350 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
351 "current service\n", sizeof(*tbl
));
354 * Initialize the hash buckets
356 for (i
=0; i
<IP_VS_LBLC_TAB_SIZE
; i
++) {
357 INIT_LIST_HEAD(&tbl
->bucket
[i
]);
359 tbl
->max_size
= IP_VS_LBLC_TAB_SIZE
*16;
364 * Hook periodic timer for garbage collection
366 setup_timer(&tbl
->periodic_timer
, ip_vs_lblc_check_expire
,
368 mod_timer(&tbl
->periodic_timer
, jiffies
+ CHECK_EXPIRE_INTERVAL
);
374 static int ip_vs_lblc_done_svc(struct ip_vs_service
*svc
)
376 struct ip_vs_lblc_table
*tbl
= svc
->sched_data
;
378 /* remove periodic timer */
379 del_timer_sync(&tbl
->periodic_timer
);
381 /* got to clean up table entries here */
382 ip_vs_lblc_flush(tbl
);
384 /* release the table itself */
386 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
393 static inline struct ip_vs_dest
*
394 __ip_vs_lblc_schedule(struct ip_vs_service
*svc
)
396 struct ip_vs_dest
*dest
, *least
;
400 * We use the following formula to estimate the load:
401 * (dest overhead) / dest->weight
403 * Remember -- no floats in kernel mode!!!
404 * The comparison of h1*w2 > h2*w1 is equivalent to that of
406 * if every weight is larger than zero.
408 * The server with weight=0 is quiesced and will not receive any
411 list_for_each_entry(dest
, &svc
->destinations
, n_list
) {
412 if (dest
->flags
& IP_VS_DEST_F_OVERLOAD
)
414 if (atomic_read(&dest
->weight
) > 0) {
416 loh
= ip_vs_dest_conn_overhead(least
);
423 * Find the destination with the least load.
426 list_for_each_entry_continue(dest
, &svc
->destinations
, n_list
) {
427 if (dest
->flags
& IP_VS_DEST_F_OVERLOAD
)
430 doh
= ip_vs_dest_conn_overhead(dest
);
431 if (loh
* atomic_read(&dest
->weight
) >
432 doh
* atomic_read(&least
->weight
)) {
438 IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
439 "activeconns %d refcnt %d weight %d overhead %d\n",
440 IP_VS_DBG_ADDR(least
->af
, &least
->addr
),
442 atomic_read(&least
->activeconns
),
443 atomic_read(&least
->refcnt
),
444 atomic_read(&least
->weight
), loh
);
451 * If this destination server is overloaded and there is a less loaded
452 * server, then return true.
455 is_overloaded(struct ip_vs_dest
*dest
, struct ip_vs_service
*svc
)
457 if (atomic_read(&dest
->activeconns
) > atomic_read(&dest
->weight
)) {
458 struct ip_vs_dest
*d
;
460 list_for_each_entry(d
, &svc
->destinations
, n_list
) {
461 if (atomic_read(&d
->activeconns
)*2
462 < atomic_read(&d
->weight
)) {
472 * Locality-Based (weighted) Least-Connection scheduling
474 static struct ip_vs_dest
*
475 ip_vs_lblc_schedule(struct ip_vs_service
*svc
, const struct sk_buff
*skb
)
477 struct ip_vs_lblc_table
*tbl
= svc
->sched_data
;
478 struct ip_vs_iphdr iph
;
479 struct ip_vs_dest
*dest
= NULL
;
480 struct ip_vs_lblc_entry
*en
;
482 ip_vs_fill_iphdr(svc
->af
, skb_network_header(skb
), &iph
);
484 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__
);
486 /* First look in our cache */
487 read_lock(&svc
->sched_lock
);
488 en
= ip_vs_lblc_get(svc
->af
, tbl
, &iph
.daddr
);
490 /* We only hold a read lock, but this is atomic */
491 en
->lastuse
= jiffies
;
494 * If the destination is not available, i.e. it's in the trash,
495 * we must ignore it, as it may be removed from under our feet,
496 * if someone drops our reference count. Our caller only makes
497 * sure that destinations, that are not in the trash, are not
498 * moved to the trash, while we are scheduling. But anyone can
499 * free up entries from the trash at any time.
502 if (en
->dest
->flags
& IP_VS_DEST_F_AVAILABLE
)
505 read_unlock(&svc
->sched_lock
);
507 /* If the destination has a weight and is not overloaded, use it */
508 if (dest
&& atomic_read(&dest
->weight
) > 0 && !is_overloaded(dest
, svc
))
511 /* No cache entry or it is invalid, time to schedule */
512 dest
= __ip_vs_lblc_schedule(svc
);
514 ip_vs_scheduler_err(svc
, "no destination available");
518 /* If we fail to create a cache entry, we'll just use the valid dest */
519 write_lock(&svc
->sched_lock
);
520 ip_vs_lblc_new(tbl
, &iph
.daddr
, dest
);
521 write_unlock(&svc
->sched_lock
);
524 IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
525 IP_VS_DBG_ADDR(svc
->af
, &iph
.daddr
),
526 IP_VS_DBG_ADDR(svc
->af
, &dest
->addr
), ntohs(dest
->port
));
533 * IPVS LBLC Scheduler structure
535 static struct ip_vs_scheduler ip_vs_lblc_scheduler
=
538 .refcnt
= ATOMIC_INIT(0),
539 .module
= THIS_MODULE
,
540 .n_list
= LIST_HEAD_INIT(ip_vs_lblc_scheduler
.n_list
),
541 .init_service
= ip_vs_lblc_init_svc
,
542 .done_service
= ip_vs_lblc_done_svc
,
543 .schedule
= ip_vs_lblc_schedule
,
550 static int __net_init
__ip_vs_lblc_init(struct net
*net
)
552 struct netns_ipvs
*ipvs
= net_ipvs(net
);
554 if (!net_eq(net
, &init_net
)) {
555 ipvs
->lblc_ctl_table
= kmemdup(vs_vars_table
,
556 sizeof(vs_vars_table
),
558 if (ipvs
->lblc_ctl_table
== NULL
)
561 ipvs
->lblc_ctl_table
= vs_vars_table
;
562 ipvs
->sysctl_lblc_expiration
= DEFAULT_EXPIRATION
;
563 ipvs
->lblc_ctl_table
[0].data
= &ipvs
->sysctl_lblc_expiration
;
565 ipvs
->lblc_ctl_header
=
566 register_net_sysctl_table(net
, net_vs_ctl_path
,
567 ipvs
->lblc_ctl_table
);
568 if (!ipvs
->lblc_ctl_header
) {
569 if (!net_eq(net
, &init_net
))
570 kfree(ipvs
->lblc_ctl_table
);
577 static void __net_exit
__ip_vs_lblc_exit(struct net
*net
)
579 struct netns_ipvs
*ipvs
= net_ipvs(net
);
581 unregister_net_sysctl_table(ipvs
->lblc_ctl_header
);
583 if (!net_eq(net
, &init_net
))
584 kfree(ipvs
->lblc_ctl_table
);
589 static int __net_init
__ip_vs_lblc_init(struct net
*net
) { return 0; }
590 static void __net_exit
__ip_vs_lblc_exit(struct net
*net
) { }
594 static struct pernet_operations ip_vs_lblc_ops
= {
595 .init
= __ip_vs_lblc_init
,
596 .exit
= __ip_vs_lblc_exit
,
599 static int __init
ip_vs_lblc_init(void)
603 ret
= register_pernet_subsys(&ip_vs_lblc_ops
);
607 ret
= register_ip_vs_scheduler(&ip_vs_lblc_scheduler
);
609 unregister_pernet_subsys(&ip_vs_lblc_ops
);
613 static void __exit
ip_vs_lblc_cleanup(void)
615 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler
);
616 unregister_pernet_subsys(&ip_vs_lblc_ops
);
620 module_init(ip_vs_lblc_init
);
621 module_exit(ip_vs_lblc_cleanup
);
622 MODULE_LICENSE("GPL");