Merge tag 'v3.10.108' into update
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / ipv4 / tcp_cong.c
CommitLineData
317a76f9
SH
1/*
2 * Plugable TCP congestion control support and newReno
3 * congestion control.
02582e9b 4 * Based on ideas from I/O scheduler support and Web100.
317a76f9
SH
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
afd46503
JP
9#define pr_fmt(fmt) "TCP: " fmt
10
317a76f9
SH
11#include <linux/module.h>
12#include <linux/mm.h>
13#include <linux/types.h>
14#include <linux/list.h>
5a0e3ad6 15#include <linux/gfp.h>
317a76f9
SH
16#include <net/tcp.h>
17
/* RFC3742 Limited Slow Start threshold (tcp_max_ssthresh sysctl).
 * 0 (the default) disables limited slow start; see tcp_slow_start().
 */
int sysctl_tcp_max_ssthresh = 0;
/* Protects writers of tcp_cong_list; readers traverse it under RCU. */
static DEFINE_SPINLOCK(tcp_cong_list_lock);
/* All registered congestion control algorithms; the head entry is the
 * current default (see tcp_set_default_congestion_control()).
 */
static LIST_HEAD(tcp_cong_list);
23/* Simple linear search, don't expect many entries! */
24static struct tcp_congestion_ops *tcp_ca_find(const char *name)
25{
26 struct tcp_congestion_ops *e;
27
5f8ef48d 28 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
317a76f9
SH
29 if (strcmp(e->name, name) == 0)
30 return e;
31 }
32
33 return NULL;
34}
35
/*
 * Attach new congestion control algorithm to the list
 * of available options.
 *
 * Returns 0 on success, -EINVAL if mandatory hooks are missing,
 * -EEXIST if an algorithm with the same name is already registered.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret = 0;

	/* all algorithms must implement ssthresh and cong_avoid ops */
	if (!ca->ssthresh || !ca->cong_avoid) {
		pr_err("%s does not implement required ops\n", ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);
	if (tcp_ca_find(ca->name)) {
		/* Names must be unique: lookups are done by name. */
		pr_notice("%s already registered\n", ca->name);
		ret = -EEXIST;
	} else {
		/* Insert at the tail so the current default, which lives
		 * at the head of the list, is left unchanged.
		 */
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		pr_info("%s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
63
/*
 * Remove congestion control algorithm, called from
 * the module's remove function. Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 *
 * NOTE(review): only the list unlink happens here; in-flight RCU
 * readers are presumably covered by the module-unload grace period —
 * confirm against the module unloading path.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
77
/* Assign choice of congestion control.
 *
 * Picks the first algorithm on tcp_cong_list whose module can be
 * pinned, then runs its init hook (if any).  Sockets that already had
 * an explicit choice (icsk_ca_ops != tcp_init_congestion_ops) keep it.
 */
void tcp_init_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;

	/* if no choice made yet assign the current value set as default */
	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
		rcu_read_lock();
		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
			/* Pin the owning module so it cannot be unloaded
			 * while this socket is using the algorithm.
			 */
			if (try_module_get(ca->owner)) {
				icsk->icsk_ca_ops = ca;
				break;
			}

			/* fallback to next available */
		}
		rcu_read_unlock();
	}

	/* Start with no remembered ssthresh from a prior episode. */
	tcp_sk(sk)->prior_ssthresh = 0;
	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
}
102
/* Manage refcounts on socket close.
 *
 * Gives the algorithm a chance to release per-socket private state,
 * then drops the module reference taken when it was attached.
 */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	module_put(icsk->icsk_ca_ops->owner);
}
112
/* Used by sysctl to change default congestion control.
 *
 * Returns 0 on success or -ENOENT if no algorithm of that name could
 * be found (or autoloaded).
 */
int tcp_set_default_congestion_control(const char *name)
{
	struct tcp_congestion_ops *ca;
	int ret = -ENOENT;

	spin_lock(&tcp_cong_list_lock);
	ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
	if (!ca && capable(CAP_NET_ADMIN)) {
		/* Not present: drop the spinlock (request_module() may
		 * sleep), try to autoload tcp_<name>, then re-lookup
		 * under the reacquired lock.
		 */
		spin_unlock(&tcp_cong_list_lock);

		request_module("tcp_%s", name);
		spin_lock(&tcp_cong_list_lock);
		ca = tcp_ca_find(name);
	}
#endif

	if (ca) {
		ca->flags |= TCP_CONG_NON_RESTRICTED;	/* default is always allowed */
		/* The head of tcp_cong_list is the default algorithm
		 * (see tcp_get_default_congestion_control()).
		 */
		list_move(&ca->list, &tcp_cong_list);
		ret = 0;
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
140
/* Set default value from kernel configuration at bootup.
 * Run as a late_initcall, presumably so built-in algorithms have had a
 * chance to register first — TODO confirm against initcall ordering.
 */
static int __init tcp_congestion_default(void)
{
	return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);
147
148
3ff825b2
SH
149/* Build string with list of available congestion control values */
150void tcp_get_available_congestion_control(char *buf, size_t maxlen)
151{
152 struct tcp_congestion_ops *ca;
153 size_t offs = 0;
154
155 rcu_read_lock();
156 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
157 offs += snprintf(buf + offs, maxlen - offs,
158 "%s%s",
159 offs == 0 ? "" : " ", ca->name);
160
161 }
162 rcu_read_unlock();
163}
164
317a76f9
SH
165/* Get current default congestion control */
166void tcp_get_default_congestion_control(char *name)
167{
168 struct tcp_congestion_ops *ca;
169 /* We will always have reno... */
170 BUG_ON(list_empty(&tcp_cong_list));
171
172 rcu_read_lock();
173 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
174 strncpy(name, ca->name, TCP_CA_NAME_MAX);
175 rcu_read_unlock();
176}
177
ce7bc3bf
SH
178/* Built list of non-restricted congestion control values */
179void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
180{
181 struct tcp_congestion_ops *ca;
182 size_t offs = 0;
183
184 *buf = '\0';
185 rcu_read_lock();
186 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
164891aa 187 if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
ce7bc3bf
SH
188 continue;
189 offs += snprintf(buf + offs, maxlen - offs,
190 "%s%s",
191 offs == 0 ? "" : " ", ca->name);
192
193 }
194 rcu_read_unlock();
195}
196
/* Change list of non-restricted congestion control.
 * @val: space-separated list of algorithm names; every named algorithm
 *       gets TCP_CONG_NON_RESTRICTED, all others lose it.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT if any
 * name is unknown (in which case no flags are changed).
 */
int tcp_set_allowed_congestion_control(char *val)
{
	struct tcp_congestion_ops *ca;
	char *saved_clone, *clone, *name;
	int ret = 0;

	/* strsep() consumes its argument, so parse a copy in pass 1 and
	 * keep the original pointer around for kfree().
	 */
	saved_clone = clone = kstrdup(val, GFP_USER);
	if (!clone)
		return -ENOMEM;

	spin_lock(&tcp_cong_list_lock);
	/* pass 1 check for bad entries */
	while ((name = strsep(&clone, " ")) && *name) {
		ca = tcp_ca_find(name);
		if (!ca) {
			ret = -ENOENT;
			goto out;
		}
	}

	/* pass 2 clear old values */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
		ca->flags &= ~TCP_CONG_NON_RESTRICTED;

	/* pass 3 mark as allowed */
	while ((name = strsep(&val, " ")) && *name) {
		ca = tcp_ca_find(name);
		WARN_ON(!ca);	/* pass 1 verified every name exists */
		if (ca)
			ca->flags |= TCP_CONG_NON_RESTRICTED;
	}
out:
	spin_unlock(&tcp_cong_list_lock);
	kfree(saved_clone);

	return ret;
}
235
236
/* Change congestion control for socket.
 *
 * Returns 0 on success, -ENOENT if the algorithm is unknown, -EPERM if
 * it is restricted and the caller lacks CAP_NET_ADMIN, or -EBUSY if the
 * owning module is going away.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;
	int err = 0;

	rcu_read_lock();
	ca = tcp_ca_find(name);

	/* no change asking for existing value */
	if (ca == icsk->icsk_ca_ops)
		goto out;

#ifdef CONFIG_MODULES
	/* not found attempt to autoload module */
	if (!ca && capable(CAP_NET_ADMIN)) {
		/* request_module() may sleep, so drop the RCU read lock
		 * across it and re-lookup afterwards.
		 */
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	if (!ca)
		err = -ENOENT;

	/* Restricted algorithms need CAP_NET_ADMIN in the socket's
	 * user namespace.
	 */
	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
		err = -EPERM;

	/* Pin the new algorithm's module for the lifetime of the socket. */
	else if (!try_module_get(ca->owner))
		err = -EBUSY;

	else {
		/* Release the old algorithm (private state + module ref)
		 * before switching over.
		 */
		tcp_cleanup_congestion_control(sk);
		icsk->icsk_ca_ops = ca;

		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
			icsk->icsk_ca_ops->init(sk);
	}
 out:
	rcu_read_unlock();
	return err;
}
281
/* RFC2861 Check whether we are limited by application or congestion window
 * This is the inverse of cwnd check in tcp_tso_should_defer
 */
bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 left;

	/* Window fully used: clearly cwnd-limited. */
	if (in_flight >= tp->snd_cwnd)
		return true;

	left = tp->snd_cwnd - in_flight;
	/* With GSO, a small leftover may be sending data deferred by
	 * tcp_tso_should_defer(); count that as cwnd-limited too so the
	 * window can still grow.  Mirrors the conditions in that helper.
	 */
	if (sk_can_gso(sk) &&
	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
	    left * tp->mss_cache < sk->sk_gso_max_size &&
	    left < sk->sk_gso_max_segs)
		return true;
	/* Otherwise: limited only if the leftover fits in the slack TSO
	 * deferral allows.
	 */
	return left <= tcp_max_tso_deferred_mss(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
40efc6fa
SH
302
/*
 * Slow start is used when congestion window is less than slow start
 * threshold. This version implements the basic RFC2581 version
 * and optionally supports:
 * RFC3742 Limited Slow Start - growth limited to max_ssthresh
 * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int cnt; /* increase in packets */
	unsigned int delta = 0;
	u32 snd_cwnd = tp->snd_cwnd;

	/* Defensive: with snd_cwnd == 0 the credit loop below would
	 * never terminate.  Clamp to 1 and complain once.
	 */
	if (unlikely(!snd_cwnd)) {
		pr_err_once("snd_cwnd is nul, please report this bug.\n");
		snd_cwnd = 1U;
	}

	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
	else
		cnt = snd_cwnd; /* exponential increase */

	/* Accumulate per-ACK credit and convert it into whole-packet
	 * window increments.
	 */
	tp->snd_cwnd_cnt += cnt;
	while (tp->snd_cwnd_cnt >= snd_cwnd) {
		tp->snd_cwnd_cnt -= snd_cwnd;
		delta++;
	}
	/* Never grow past the clamp. */
	tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
334
758ce5c8
IJ
335/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
336void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
337{
338 if (tp->snd_cwnd_cnt >= w) {
339 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
340 tp->snd_cwnd++;
341 tp->snd_cwnd_cnt = 0;
342 } else {
343 tp->snd_cwnd_cnt++;
344 }
345}
346EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
347
317a76f9
SH
348/*
349 * TCP Reno congestion control
350 * This is special case used for fallback as well.
351 */
352/* This is Jacobson's slow start and congestion avoidance.
353 * SIGCOMM '88, p. 328.
354 */
c3a05c60 355void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
317a76f9 356{
6687e988
ACM
357 struct tcp_sock *tp = tcp_sk(sk);
358
f4805ede 359 if (!tcp_is_cwnd_limited(sk, in_flight))
317a76f9
SH
360 return;
361
7faffa1c 362 /* In "safe" area, increase. */
e905a9ed 363 if (tp->snd_cwnd <= tp->snd_ssthresh)
7faffa1c 364 tcp_slow_start(tp);
e905a9ed 365 /* In dangerous area, increase slowly. */
ca2eb567 366 else
758ce5c8 367 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
317a76f9
SH
368}
369EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
370
371/* Slow start threshold is half the congestion window (min 2) */
6687e988 372u32 tcp_reno_ssthresh(struct sock *sk)
317a76f9 373{
6687e988 374 const struct tcp_sock *tp = tcp_sk(sk);
317a76f9
SH
375 return max(tp->snd_cwnd >> 1U, 2U);
376}
377EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
378
72dc5b92
SH
379/* Lower bound on congestion window with halving. */
380u32 tcp_reno_min_cwnd(const struct sock *sk)
317a76f9 381{
6687e988 382 const struct tcp_sock *tp = tcp_sk(sk);
317a76f9
SH
383 return tp->snd_ssthresh/2;
384}
385EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
386
/* Classic Reno: registered unconditionally and used as the fallback. */
struct tcp_congestion_ops tcp_reno = {
	.flags		= TCP_CONG_NON_RESTRICTED,	/* always selectable */
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};
395
/* Initial congestion control used (until SYN)
 * really reno under another name so we can tell difference
 * during tcp_set_default_congestion_control
 */
struct tcp_congestion_ops tcp_init_congestion_ops  = {
	.name		= "",	/* empty on purpose: distinguishable from "reno" */
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);