Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | /* |
2 | * background writeback - scan btree for dirty data and write it to the backing | |
3 | * device | |
4 | * | |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | |
6 | * Copyright 2012 Google, Inc. | |
7 | */ | |
8 | ||
9 | #include "bcache.h" | |
10 | #include "btree.h" | |
11 | #include "debug.h" | |
279afbad | 12 | #include "writeback.h" |
cafe5635 | 13 | |
5e6926da KO |
14 | #include <linux/delay.h> |
15 | #include <linux/freezer.h> | |
16 | #include <linux/kthread.h> | |
c37511b8 KO |
17 | #include <trace/events/bcache.h> |
18 | ||
cafe5635 KO |
19 | /* Rate limiting */ |
20 | ||
21 | static void __update_writeback_rate(struct cached_dev *dc) | |
22 | { | |
23 | struct cache_set *c = dc->disk.c; | |
24 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | |
25 | uint64_t cache_dirty_target = | |
26 | div_u64(cache_sectors * dc->writeback_percent, 100); | |
27 | ||
28 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | |
29 | c->cached_dev_sectors); | |
30 | ||
31 | /* PD controller */ | |
32 | ||
33 | int change = 0; | |
34 | int64_t error; | |
279afbad | 35 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
cafe5635 KO |
36 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
37 | ||
38 | dc->disk.sectors_dirty_last = dirty; | |
39 | ||
40 | derivative *= dc->writeback_rate_d_term; | |
41 | derivative = clamp(derivative, -dirty, dirty); | |
42 | ||
43 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | |
44 | dc->writeback_rate_d_smooth, 0); | |
45 | ||
46 | /* Avoid divide by zero */ | |
47 | if (!target) | |
48 | goto out; | |
49 | ||
50 | error = div64_s64((dirty + derivative - target) << 8, target); | |
51 | ||
52 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | |
53 | dc->writeback_rate_p_term_inverse); | |
54 | ||
55 | /* Don't increase writeback rate if the device isn't keeping up */ | |
56 | if (change > 0 && | |
57 | time_after64(local_clock(), | |
58 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | |
59 | change = 0; | |
60 | ||
61 | dc->writeback_rate.rate = | |
62 | clamp_t(int64_t, dc->writeback_rate.rate + change, | |
63 | 1, NSEC_PER_MSEC); | |
64 | out: | |
65 | dc->writeback_rate_derivative = derivative; | |
66 | dc->writeback_rate_change = change; | |
67 | dc->writeback_rate_target = target; | |
cafe5635 KO |
68 | } |
69 | ||
70 | static void update_writeback_rate(struct work_struct *work) | |
71 | { | |
72 | struct cached_dev *dc = container_of(to_delayed_work(work), | |
73 | struct cached_dev, | |
74 | writeback_rate_update); | |
75 | ||
76 | down_read(&dc->writeback_lock); | |
77 | ||
78 | if (atomic_read(&dc->has_dirty) && | |
79 | dc->writeback_percent) | |
80 | __update_writeback_rate(dc); | |
81 | ||
82 | up_read(&dc->writeback_lock); | |
5e6926da KO |
83 | |
84 | schedule_delayed_work(&dc->writeback_rate_update, | |
85 | dc->writeback_rate_update_seconds * HZ); | |
cafe5635 KO |
86 | } |
87 | ||
88 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | |
89 | { | |
c2a4f318 KO |
90 | uint64_t ret; |
91 | ||
cafe5635 KO |
92 | if (atomic_read(&dc->disk.detaching) || |
93 | !dc->writeback_percent) | |
94 | return 0; | |
95 | ||
c2a4f318 KO |
96 | ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); |
97 | ||
98 | return min_t(uint64_t, ret, HZ); | |
cafe5635 KO |
99 | } |
100 | ||
5e6926da KO |
101 | struct dirty_io { |
102 | struct closure cl; | |
103 | struct cached_dev *dc; | |
104 | struct bio bio; | |
105 | }; | |
72c27061 | 106 | |
cafe5635 KO |
107 | static void dirty_init(struct keybuf_key *w) |
108 | { | |
109 | struct dirty_io *io = w->private; | |
110 | struct bio *bio = &io->bio; | |
111 | ||
112 | bio_init(bio); | |
113 | if (!io->dc->writeback_percent) | |
114 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | |
115 | ||
116 | bio->bi_size = KEY_SIZE(&w->key) << 9; | |
117 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | |
118 | bio->bi_private = w; | |
119 | bio->bi_io_vec = bio->bi_inline_vecs; | |
169ef1cf | 120 | bch_bio_map(bio, NULL); |
cafe5635 KO |
121 | } |
122 | ||
cafe5635 KO |
123 | static void dirty_io_destructor(struct closure *cl) |
124 | { | |
125 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
126 | kfree(io); | |
127 | } | |
128 | ||
129 | static void write_dirty_finish(struct closure *cl) | |
130 | { | |
131 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
132 | struct keybuf_key *w = io->bio.bi_private; | |
133 | struct cached_dev *dc = io->dc; | |
8e51e414 KO |
134 | struct bio_vec *bv; |
135 | int i; | |
cafe5635 | 136 | |
8e51e414 | 137 | bio_for_each_segment_all(bv, &io->bio, i) |
cafe5635 KO |
138 | __free_page(bv->bv_page); |
139 | ||
140 | /* This is kind of a dumb way of signalling errors. */ | |
141 | if (KEY_DIRTY(&w->key)) { | |
142 | unsigned i; | |
143 | struct btree_op op; | |
0b93207a | 144 | struct keylist keys; |
6054c6d4 | 145 | int ret; |
0b93207a | 146 | |
b54d6934 | 147 | bch_btree_op_init(&op, -1); |
0b93207a | 148 | bch_keylist_init(&keys); |
cafe5635 | 149 | |
1b207d80 KO |
150 | bkey_copy(keys.top, &w->key); |
151 | SET_KEY_DIRTY(keys.top, false); | |
152 | bch_keylist_push(&keys); | |
cafe5635 KO |
153 | |
154 | for (i = 0; i < KEY_PTRS(&w->key); i++) | |
155 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | |
156 | ||
6054c6d4 | 157 | ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); |
cafe5635 | 158 | |
6054c6d4 | 159 | if (ret) |
c37511b8 KO |
160 | trace_bcache_writeback_collision(&w->key); |
161 | ||
6054c6d4 | 162 | atomic_long_inc(ret |
cafe5635 KO |
163 | ? &dc->disk.c->writeback_keys_failed |
164 | : &dc->disk.c->writeback_keys_done); | |
165 | } | |
166 | ||
167 | bch_keybuf_del(&dc->writeback_keys, w); | |
c2a4f318 | 168 | up(&dc->in_flight); |
cafe5635 KO |
169 | |
170 | closure_return_with_destructor(cl, dirty_io_destructor); | |
171 | } | |
172 | ||
173 | static void dirty_endio(struct bio *bio, int error) | |
174 | { | |
175 | struct keybuf_key *w = bio->bi_private; | |
176 | struct dirty_io *io = w->private; | |
177 | ||
178 | if (error) | |
179 | SET_KEY_DIRTY(&w->key, false); | |
180 | ||
181 | closure_put(&io->cl); | |
182 | } | |
183 | ||
184 | static void write_dirty(struct closure *cl) | |
185 | { | |
186 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
187 | struct keybuf_key *w = io->bio.bi_private; | |
188 | ||
189 | dirty_init(w); | |
190 | io->bio.bi_rw = WRITE; | |
191 | io->bio.bi_sector = KEY_START(&w->key); | |
192 | io->bio.bi_bdev = io->dc->bdev; | |
193 | io->bio.bi_end_io = dirty_endio; | |
194 | ||
cafe5635 KO |
195 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
196 | ||
c2a4f318 | 197 | continue_at(cl, write_dirty_finish, system_wq); |
cafe5635 KO |
198 | } |
199 | ||
200 | static void read_dirty_endio(struct bio *bio, int error) | |
201 | { | |
202 | struct keybuf_key *w = bio->bi_private; | |
203 | struct dirty_io *io = w->private; | |
204 | ||
205 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | |
206 | error, "reading dirty data from cache"); | |
207 | ||
208 | dirty_endio(bio, error); | |
209 | } | |
210 | ||
211 | static void read_dirty_submit(struct closure *cl) | |
212 | { | |
213 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
214 | ||
cafe5635 KO |
215 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
216 | ||
c2a4f318 | 217 | continue_at(cl, write_dirty, system_wq); |
cafe5635 KO |
218 | } |
219 | ||
5e6926da | 220 | static void read_dirty(struct cached_dev *dc) |
cafe5635 | 221 | { |
5e6926da | 222 | unsigned delay = 0; |
cafe5635 KO |
223 | struct keybuf_key *w; |
224 | struct dirty_io *io; | |
5e6926da KO |
225 | struct closure cl; |
226 | ||
227 | closure_init_stack(&cl); | |
cafe5635 KO |
228 | |
229 | /* | |
230 | * XXX: if we error, background writeback just spins. Should use some | |
231 | * mempools. | |
232 | */ | |
233 | ||
5e6926da KO |
234 | while (!kthread_should_stop()) { |
235 | try_to_freeze(); | |
236 | ||
cafe5635 KO |
237 | w = bch_keybuf_next(&dc->writeback_keys); |
238 | if (!w) | |
239 | break; | |
240 | ||
241 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | |
242 | ||
5e6926da KO |
243 | if (KEY_START(&w->key) != dc->last_read || |
244 | jiffies_to_msecs(delay) > 50) | |
245 | while (!kthread_should_stop() && delay) | |
246 | delay = schedule_timeout_interruptible(delay); | |
cafe5635 KO |
247 | |
248 | dc->last_read = KEY_OFFSET(&w->key); | |
249 | ||
250 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | |
251 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | |
252 | GFP_KERNEL); | |
253 | if (!io) | |
254 | goto err; | |
255 | ||
256 | w->private = io; | |
257 | io->dc = dc; | |
258 | ||
259 | dirty_init(w); | |
260 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | |
261 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | |
262 | &w->key, 0)->bdev; | |
263 | io->bio.bi_rw = READ; | |
264 | io->bio.bi_end_io = read_dirty_endio; | |
265 | ||
8e51e414 | 266 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) |
cafe5635 KO |
267 | goto err_free; |
268 | ||
c37511b8 | 269 | trace_bcache_writeback(&w->key); |
cafe5635 | 270 | |
c2a4f318 | 271 | down(&dc->in_flight); |
5e6926da | 272 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); |
cafe5635 KO |
273 | |
274 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | |
cafe5635 KO |
275 | } |
276 | ||
277 | if (0) { | |
278 | err_free: | |
279 | kfree(w->private); | |
280 | err: | |
281 | bch_keybuf_del(&dc->writeback_keys, w); | |
282 | } | |
283 | ||
c2a4f318 KO |
284 | /* |
285 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be | |
286 | * freed) before refilling again | |
287 | */ | |
5e6926da KO |
288 | closure_sync(&cl); |
289 | } | |
290 | ||
291 | /* Scan for dirty data */ | |
292 | ||
293 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | |
294 | uint64_t offset, int nr_sectors) | |
295 | { | |
296 | struct bcache_device *d = c->devices[inode]; | |
297 | unsigned stripe_offset; | |
298 | uint64_t stripe = offset; | |
299 | ||
300 | if (!d) | |
301 | return; | |
302 | ||
303 | do_div(stripe, d->stripe_size); | |
304 | ||
305 | stripe_offset = offset & (d->stripe_size - 1); | |
306 | ||
307 | while (nr_sectors) { | |
308 | int s = min_t(unsigned, abs(nr_sectors), | |
309 | d->stripe_size - stripe_offset); | |
310 | ||
311 | if (nr_sectors < 0) | |
312 | s = -s; | |
313 | ||
314 | atomic_add(s, d->stripe_sectors_dirty + stripe); | |
315 | nr_sectors -= s; | |
316 | stripe_offset = 0; | |
317 | stripe++; | |
318 | } | |
319 | } | |
320 | ||
321 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | |
322 | { | |
323 | return KEY_DIRTY(k); | |
324 | } | |
325 | ||
326 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | |
327 | { | |
328 | uint64_t stripe = KEY_START(k); | |
329 | unsigned nr_sectors = KEY_SIZE(k); | |
330 | struct cached_dev *dc = container_of(buf, struct cached_dev, | |
331 | writeback_keys); | |
332 | ||
333 | if (!KEY_DIRTY(k)) | |
334 | return false; | |
335 | ||
336 | do_div(stripe, dc->disk.stripe_size); | |
337 | ||
338 | while (1) { | |
339 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == | |
340 | dc->disk.stripe_size) | |
341 | return true; | |
342 | ||
343 | if (nr_sectors <= dc->disk.stripe_size) | |
344 | return false; | |
345 | ||
346 | nr_sectors -= dc->disk.stripe_size; | |
347 | stripe++; | |
348 | } | |
349 | } | |
350 | ||
351 | static bool refill_dirty(struct cached_dev *dc) | |
352 | { | |
353 | struct keybuf *buf = &dc->writeback_keys; | |
354 | bool searched_from_start = false; | |
355 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); | |
356 | ||
357 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | |
358 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | |
359 | searched_from_start = true; | |
360 | } | |
361 | ||
362 | if (dc->partial_stripes_expensive) { | |
363 | uint64_t i; | |
364 | ||
365 | for (i = 0; i < dc->disk.nr_stripes; i++) | |
366 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | |
367 | dc->disk.stripe_size) | |
368 | goto full_stripes; | |
369 | ||
370 | goto normal_refill; | |
371 | full_stripes: | |
372 | searched_from_start = false; /* not searching entire btree */ | |
373 | bch_refill_keybuf(dc->disk.c, buf, &end, | |
374 | dirty_full_stripe_pred); | |
375 | } else { | |
376 | normal_refill: | |
377 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | |
378 | } | |
379 | ||
380 | return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; | |
381 | } | |
382 | ||
383 | static int bch_writeback_thread(void *arg) | |
384 | { | |
385 | struct cached_dev *dc = arg; | |
386 | bool searched_full_index; | |
387 | ||
388 | while (!kthread_should_stop()) { | |
389 | down_write(&dc->writeback_lock); | |
390 | if (!atomic_read(&dc->has_dirty) || | |
391 | (!atomic_read(&dc->disk.detaching) && | |
392 | !dc->writeback_running)) { | |
393 | up_write(&dc->writeback_lock); | |
394 | set_current_state(TASK_INTERRUPTIBLE); | |
395 | ||
396 | if (kthread_should_stop()) | |
397 | return 0; | |
398 | ||
399 | try_to_freeze(); | |
400 | schedule(); | |
401 | continue; | |
402 | } | |
403 | ||
404 | searched_full_index = refill_dirty(dc); | |
405 | ||
406 | if (searched_full_index && | |
407 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { | |
408 | atomic_set(&dc->has_dirty, 0); | |
409 | cached_dev_put(dc); | |
410 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | |
411 | bch_write_bdev_super(dc, NULL); | |
412 | } | |
413 | ||
414 | up_write(&dc->writeback_lock); | |
415 | ||
416 | bch_ratelimit_reset(&dc->writeback_rate); | |
417 | read_dirty(dc); | |
418 | ||
419 | if (searched_full_index) { | |
420 | unsigned delay = dc->writeback_delay * HZ; | |
421 | ||
422 | while (delay && | |
423 | !kthread_should_stop() && | |
424 | !atomic_read(&dc->disk.detaching)) | |
425 | delay = schedule_timeout_interruptible(delay); | |
426 | } | |
427 | } | |
428 | ||
429 | return 0; | |
cafe5635 KO |
430 | } |
431 | ||
444fc0b6 KO |
432 | /* Init */ |
433 | ||
c18536a7 KO |
434 | struct sectors_dirty_init { |
435 | struct btree_op op; | |
436 | unsigned inode; | |
437 | }; | |
438 | ||
439 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | |
48dad8ba | 440 | struct bkey *k) |
444fc0b6 | 441 | { |
c18536a7 KO |
442 | struct sectors_dirty_init *op = container_of(_op, |
443 | struct sectors_dirty_init, op); | |
48dad8ba KO |
444 | if (KEY_INODE(k) > op->inode) |
445 | return MAP_DONE; | |
444fc0b6 | 446 | |
48dad8ba KO |
447 | if (KEY_DIRTY(k)) |
448 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | |
449 | KEY_START(k), KEY_SIZE(k)); | |
450 | ||
451 | return MAP_CONTINUE; | |
444fc0b6 KO |
452 | } |
453 | ||
454 | void bch_sectors_dirty_init(struct cached_dev *dc) | |
455 | { | |
c18536a7 | 456 | struct sectors_dirty_init op; |
444fc0b6 | 457 | |
b54d6934 | 458 | bch_btree_op_init(&op.op, -1); |
48dad8ba KO |
459 | op.inode = dc->disk.id; |
460 | ||
c18536a7 | 461 | bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), |
48dad8ba | 462 | sectors_dirty_init_fn, 0); |
444fc0b6 KO |
463 | } |
464 | ||
5e6926da | 465 | int bch_cached_dev_writeback_init(struct cached_dev *dc) |
cafe5635 | 466 | { |
c2a4f318 | 467 | sema_init(&dc->in_flight, 64); |
cafe5635 | 468 | init_rwsem(&dc->writeback_lock); |
72c27061 | 469 | bch_keybuf_init(&dc->writeback_keys); |
cafe5635 KO |
470 | |
471 | dc->writeback_metadata = true; | |
472 | dc->writeback_running = true; | |
473 | dc->writeback_percent = 10; | |
474 | dc->writeback_delay = 30; | |
475 | dc->writeback_rate.rate = 1024; | |
476 | ||
477 | dc->writeback_rate_update_seconds = 30; | |
478 | dc->writeback_rate_d_term = 16; | |
479 | dc->writeback_rate_p_term_inverse = 64; | |
480 | dc->writeback_rate_d_smooth = 8; | |
481 | ||
5e6926da KO |
482 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, |
483 | "bcache_writeback"); | |
484 | if (IS_ERR(dc->writeback_thread)) | |
485 | return PTR_ERR(dc->writeback_thread); | |
486 | ||
487 | set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); | |
488 | ||
cafe5635 KO |
489 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
490 | schedule_delayed_work(&dc->writeback_rate_update, | |
491 | dc->writeback_rate_update_seconds * HZ); | |
cafe5635 KO |
492 | |
493 | return 0; | |
494 | } |