bcache: Move keylist out of btree_op
[GitHub/exynos8895/android_kernel_samsung_universal8895.git] / drivers / md / bcache / btree.c
CommitLineData
cafe5635
KO
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "request.h"
279afbad 27#include "writeback.h"
cafe5635
KO
28
29#include <linux/slab.h>
30#include <linux/bitops.h>
31#include <linux/hash.h>
cd953ed0 32#include <linux/prefetch.h>
cafe5635
KO
33#include <linux/random.h>
34#include <linux/rcupdate.h>
35#include <trace/events/bcache.h>
36
37/*
38 * Todo:
39 * register_bcache: Return errors out to userspace correctly
40 *
41 * Writeback: don't undirty key until after a cache flush
42 *
43 * Create an iterator for key pointers
44 *
45 * On btree write error, mark bucket such that it won't be freed from the cache
46 *
47 * Journalling:
48 * Check for bad keys in replay
49 * Propagate barriers
50 * Refcount journal entries in journal_replay
51 *
52 * Garbage collection:
53 * Finish incremental gc
54 * Gc should free old UUIDs, data for invalid UUIDs
55 *
56 * Provide a way to list backing device UUIDs we have data cached for, and
57 * probably how long it's been since we've seen them, and a way to invalidate
58 * dirty data for devices that will never be attached again
59 *
60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
61 * that based on that and how much dirty data we have we can keep writeback
62 * from being starved
63 *
64 * Add a tracepoint or somesuch to watch for writeback starvation
65 *
66 * When btree depth > 1 and splitting an interior node, we have to make sure
67 * alloc_bucket() cannot fail. This should be true but is not completely
68 * obvious.
69 *
70 * Make sure all allocations get charged to the root cgroup
71 *
72 * Plugging?
73 *
74 * If data write is less than hard sector size of ssd, round up offset in open
75 * bucket to the next whole sector
76 *
77 * Also lookup by cgroup in get_open_bucket()
78 *
79 * Superblock needs to be fleshed out for multiple cache devices
80 *
81 * Add a sysfs tunable for the number of writeback IOs in flight
82 *
83 * Add a sysfs tunable for the number of open data buckets
84 *
85 * IO tracking: Can we track when one process is doing io on behalf of another?
86 * IO tracking: Don't use just an average, weigh more recent stuff higher
87 *
88 * Test module load/unload
89 */
90
91static const char * const op_types[] = {
92 "insert", "replace"
93};
94
95static const char *op_type(struct btree_op *op)
96{
97 return op_types[op->type];
98}
99
/* NOTE(review): neither limit is referenced in the visible part of this file. */
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

/*
 * Value used to hash and compare btree nodes in the in-memory node cache:
 * the first pointer of @k shifted right by the cache set's bucket_bits,
 * or'd with the generation of that pointer.
 *
 * Both arguments are parenthesized so that any pointer-valued expression can
 * be passed safely.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))

/* NOTE(review): bch_gc_wq is not used in the visible part of this file. */
struct workqueue_struct *bch_gc_wq;
/* Workqueue on which delayed btree node writes (btree->work) are queued. */
static struct workqueue_struct *btree_io_wq;
110
111void bch_btree_op_init_stack(struct btree_op *op)
112{
113 memset(op, 0, sizeof(struct btree_op));
114 closure_init_stack(&op->cl);
115 op->lock = -1;
cafe5635
KO
116}
117
118/* Btree key manipulation */
119
e7c590eb
KO
120void __bkey_put(struct cache_set *c, struct bkey *k)
121{
122 unsigned i;
123
124 for (i = 0; i < KEY_PTRS(k); i++)
125 if (ptr_available(c, k, i))
126 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
127}
128
cafe5635
KO
/*
 * Unpin the buckets referenced by @k.
 *
 * Keys at level 0 are always unpinned; keys at higher levels only when their
 * offset is nonzero.
 */
static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if (!level || KEY_OFFSET(k))
		__bkey_put(c, k);
}
134
135/* Btree IO */
136
137static uint64_t btree_csum_set(struct btree *b, struct bset *i)
138{
139 uint64_t crc = b->key.ptr[0];
140 void *data = (void *) i + 8, *end = end(i);
141
169ef1cf 142 crc = bch_crc64_update(crc, data, end - data);
c19ed23a 143 return crc ^ 0xffffffffffffffffULL;
cafe5635
KO
144}
145
f3059a54 146static void bch_btree_node_read_done(struct btree *b)
cafe5635 147{
cafe5635 148 const char *err = "bad btree header";
57943511
KO
149 struct bset *i = b->sets[0].data;
150 struct btree_iter *iter;
cafe5635 151
57943511
KO
152 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
153 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
cafe5635
KO
154 iter->used = 0;
155
57943511 156 if (!i->seq)
cafe5635
KO
157 goto err;
158
159 for (;
160 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
161 i = write_block(b)) {
162 err = "unsupported bset version";
163 if (i->version > BCACHE_BSET_VERSION)
164 goto err;
165
166 err = "bad btree header";
167 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
168 goto err;
169
170 err = "bad magic";
171 if (i->magic != bset_magic(b->c))
172 goto err;
173
174 err = "bad checksum";
175 switch (i->version) {
176 case 0:
177 if (i->csum != csum_set(i))
178 goto err;
179 break;
180 case BCACHE_BSET_VERSION:
181 if (i->csum != btree_csum_set(b, i))
182 goto err;
183 break;
184 }
185
186 err = "empty set";
187 if (i != b->sets[0].data && !i->keys)
188 goto err;
189
190 bch_btree_iter_push(iter, i->start, end(i));
191
192 b->written += set_blocks(i, b->c);
193 }
194
195 err = "corrupted btree";
196 for (i = write_block(b);
197 index(i, b) < btree_blocks(b);
198 i = ((void *) i) + block_bytes(b->c))
199 if (i->seq == b->sets[0].data->seq)
200 goto err;
201
202 bch_btree_sort_and_fix_extents(b, iter);
203
204 i = b->sets[0].data;
205 err = "short btree key";
206 if (b->sets[0].size &&
207 bkey_cmp(&b->key, &b->sets[0].end) < 0)
208 goto err;
209
210 if (b->written < btree_blocks(b))
211 bch_bset_init_next(b);
212out:
57943511
KO
213 mempool_free(iter, b->c->fill_iter);
214 return;
cafe5635
KO
215err:
216 set_btree_node_io_error(b);
07e86ccb 217 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
cafe5635
KO
218 err, PTR_BUCKET_NR(b->c, &b->key, 0),
219 index(i, b), i->keys);
220 goto out;
221}
222
57943511 223static void btree_node_read_endio(struct bio *bio, int error)
cafe5635 224{
57943511
KO
225 struct closure *cl = bio->bi_private;
226 closure_put(cl);
227}
cafe5635 228
57943511
KO
229void bch_btree_node_read(struct btree *b)
230{
231 uint64_t start_time = local_clock();
232 struct closure cl;
233 struct bio *bio;
cafe5635 234
c37511b8
KO
235 trace_bcache_btree_read(b);
236
57943511 237 closure_init_stack(&cl);
cafe5635 238
57943511
KO
239 bio = bch_bbio_alloc(b->c);
240 bio->bi_rw = REQ_META|READ_SYNC;
241 bio->bi_size = KEY_SIZE(&b->key) << 9;
242 bio->bi_end_io = btree_node_read_endio;
243 bio->bi_private = &cl;
cafe5635 244
57943511 245 bch_bio_map(bio, b->sets[0].data);
cafe5635 246
57943511
KO
247 bch_submit_bbio(bio, b->c, &b->key, 0);
248 closure_sync(&cl);
cafe5635 249
57943511
KO
250 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
251 set_btree_node_io_error(b);
252
253 bch_bbio_free(bio, b->c);
254
255 if (btree_node_io_error(b))
256 goto err;
257
258 bch_btree_node_read_done(b);
259
260 spin_lock(&b->c->btree_read_time_lock);
261 bch_time_stats_update(&b->c->btree_read_time, start_time);
262 spin_unlock(&b->c->btree_read_time_lock);
263
264 return;
265err:
61cbd250 266 bch_cache_set_error(b->c, "io error reading bucket %zu",
57943511 267 PTR_BUCKET_NR(b->c, &b->key, 0));
cafe5635
KO
268}
269
270static void btree_complete_write(struct btree *b, struct btree_write *w)
271{
272 if (w->prio_blocked &&
273 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
119ba0f8 274 wake_up_allocators(b->c);
cafe5635
KO
275
276 if (w->journal) {
277 atomic_dec_bug(w->journal);
278 __closure_wake_up(&b->c->journal.wait);
279 }
280
cafe5635
KO
281 w->prio_blocked = 0;
282 w->journal = NULL;
cafe5635
KO
283}
284
57943511 285static void __btree_node_write_done(struct closure *cl)
cafe5635
KO
286{
287 struct btree *b = container_of(cl, struct btree, io.cl);
288 struct btree_write *w = btree_prev_write(b);
289
290 bch_bbio_free(b->bio, b->c);
291 b->bio = NULL;
292 btree_complete_write(b, w);
293
294 if (btree_node_dirty(b))
295 queue_delayed_work(btree_io_wq, &b->work,
296 msecs_to_jiffies(30000));
297
298 closure_return(cl);
299}
300
57943511 301static void btree_node_write_done(struct closure *cl)
cafe5635
KO
302{
303 struct btree *b = container_of(cl, struct btree, io.cl);
304 struct bio_vec *bv;
305 int n;
306
307 __bio_for_each_segment(bv, b->bio, n, 0)
308 __free_page(bv->bv_page);
309
57943511 310 __btree_node_write_done(cl);
cafe5635
KO
311}
312
57943511
KO
313static void btree_node_write_endio(struct bio *bio, int error)
314{
315 struct closure *cl = bio->bi_private;
316 struct btree *b = container_of(cl, struct btree, io.cl);
317
318 if (error)
319 set_btree_node_io_error(b);
320
321 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
322 closure_put(cl);
323}
324
325static void do_btree_node_write(struct btree *b)
cafe5635
KO
326{
327 struct closure *cl = &b->io.cl;
328 struct bset *i = b->sets[b->nsets].data;
329 BKEY_PADDED(key) k;
330
331 i->version = BCACHE_BSET_VERSION;
332 i->csum = btree_csum_set(b, i);
333
57943511
KO
334 BUG_ON(b->bio);
335 b->bio = bch_bbio_alloc(b->c);
336
337 b->bio->bi_end_io = btree_node_write_endio;
338 b->bio->bi_private = &b->io.cl;
e49c7c37
KO
339 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
340 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
169ef1cf 341 bch_bio_map(b->bio, i);
cafe5635 342
e49c7c37
KO
343 /*
344 * If we're appending to a leaf node, we don't technically need FUA -
345 * this write just needs to be persisted before the next journal write,
346 * which will be marked FLUSH|FUA.
347 *
348 * Similarly if we're writing a new btree root - the pointer is going to
349 * be in the next journal entry.
350 *
351 * But if we're writing a new btree node (that isn't a root) or
352 * appending to a non leaf btree node, we need either FUA or a flush
353 * when we write the parent with the new pointer. FUA is cheaper than a
354 * flush, and writes appending to leaf nodes aren't blocking anything so
355 * just make all btree node writes FUA to keep things sane.
356 */
357
cafe5635
KO
358 bkey_copy(&k.key, &b->key);
359 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
360
8e51e414 361 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
cafe5635
KO
362 int j;
363 struct bio_vec *bv;
364 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
365
366 bio_for_each_segment(bv, b->bio, j)
367 memcpy(page_address(bv->bv_page),
368 base + j * PAGE_SIZE, PAGE_SIZE);
369
cafe5635
KO
370 bch_submit_bbio(b->bio, b->c, &k.key, 0);
371
57943511 372 continue_at(cl, btree_node_write_done, NULL);
cafe5635
KO
373 } else {
374 b->bio->bi_vcnt = 0;
169ef1cf 375 bch_bio_map(b->bio, i);
cafe5635 376
cafe5635
KO
377 bch_submit_bbio(b->bio, b->c, &k.key, 0);
378
379 closure_sync(cl);
57943511 380 __btree_node_write_done(cl);
cafe5635
KO
381 }
382}
383
57943511 384void bch_btree_node_write(struct btree *b, struct closure *parent)
cafe5635
KO
385{
386 struct bset *i = b->sets[b->nsets].data;
387
c37511b8
KO
388 trace_bcache_btree_write(b);
389
cafe5635 390 BUG_ON(current->bio_list);
57943511
KO
391 BUG_ON(b->written >= btree_blocks(b));
392 BUG_ON(b->written && !i->keys);
393 BUG_ON(b->sets->data->seq != i->seq);
c37511b8 394 bch_check_key_order(b, i);
cafe5635 395
cafe5635
KO
396 cancel_delayed_work(&b->work);
397
57943511
KO
398 /* If caller isn't waiting for write, parent refcount is cache set */
399 closure_lock(&b->io, parent ?: &b->c->cl);
400
cafe5635
KO
401 clear_bit(BTREE_NODE_dirty, &b->flags);
402 change_bit(BTREE_NODE_write_idx, &b->flags);
403
57943511 404 do_btree_node_write(b);
cafe5635 405
cafe5635
KO
406 b->written += set_blocks(i, b->c);
407 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
408 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
409
410 bch_btree_sort_lazy(b);
411
412 if (b->written < btree_blocks(b))
413 bch_bset_init_next(b);
414}
415
57943511 416static void btree_node_write_work(struct work_struct *w)
cafe5635
KO
417{
418 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
419
57943511 420 rw_lock(true, b, b->level);
cafe5635
KO
421
422 if (btree_node_dirty(b))
57943511
KO
423 bch_btree_node_write(b, NULL);
424 rw_unlock(true, b);
cafe5635
KO
425}
426
57943511 427static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
cafe5635
KO
428{
429 struct bset *i = b->sets[b->nsets].data;
430 struct btree_write *w = btree_current_write(b);
431
57943511
KO
432 BUG_ON(!b->written);
433 BUG_ON(!i->keys);
cafe5635 434
57943511
KO
435 if (!btree_node_dirty(b))
436 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
cafe5635 437
57943511 438 set_btree_node_dirty(b);
cafe5635 439
57943511 440 if (op && op->journal) {
cafe5635
KO
441 if (w->journal &&
442 journal_pin_cmp(b->c, w, op)) {
443 atomic_dec_bug(w->journal);
444 w->journal = NULL;
445 }
446
447 if (!w->journal) {
448 w->journal = op->journal;
449 atomic_inc(w->journal);
450 }
451 }
452
cafe5635 453 /* Force write if set is too big */
57943511
KO
454 if (set_bytes(i) > PAGE_SIZE - 48 &&
455 !current->bio_list)
456 bch_btree_node_write(b, NULL);
cafe5635
KO
457}
458
459/*
460 * Btree in memory cache - allocation/freeing
461 * mca -> memory cache
462 */
463
464static void mca_reinit(struct btree *b)
465{
466 unsigned i;
467
468 b->flags = 0;
469 b->written = 0;
470 b->nsets = 0;
471
472 for (i = 0; i < MAX_BSETS; i++)
473 b->sets[i].size = 0;
474 /*
475 * Second loop starts at 1 because b->sets[0]->data is the memory we
476 * allocated
477 */
478 for (i = 1; i < MAX_BSETS; i++)
479 b->sets[i].data = NULL;
480}
481
/*
 * Number of cached btree nodes that must always be kept: 8 per btree level
 * (using 1 when there is no root yet or the root's level is 0) plus 16.
 * The macro argument is parenthesized so any expression may be passed.
 */
#define mca_reserve(c)	((((c)->root && (c)->root->level)		\
			  ? (c)->root->level : 1) * 8 + 16)
/* How many cached nodes may be freed without cutting into the reserve. */
#define mca_can_free(c)						\
	max_t(int, 0, (c)->bucket_cache_used - mca_reserve(c))
486
487static void mca_data_free(struct btree *b)
488{
489 struct bset_tree *t = b->sets;
490 BUG_ON(!closure_is_unlocked(&b->io.cl));
491
492 if (bset_prev_bytes(b) < PAGE_SIZE)
493 kfree(t->prev);
494 else
495 free_pages((unsigned long) t->prev,
496 get_order(bset_prev_bytes(b)));
497
498 if (bset_tree_bytes(b) < PAGE_SIZE)
499 kfree(t->tree);
500 else
501 free_pages((unsigned long) t->tree,
502 get_order(bset_tree_bytes(b)));
503
504 free_pages((unsigned long) t->data, b->page_order);
505
506 t->prev = NULL;
507 t->tree = NULL;
508 t->data = NULL;
509 list_move(&b->list, &b->c->btree_cache_freed);
510 b->c->bucket_cache_used--;
511}
512
513static void mca_bucket_free(struct btree *b)
514{
515 BUG_ON(btree_node_dirty(b));
516
517 b->key.ptr[0] = 0;
518 hlist_del_init_rcu(&b->hash);
519 list_move(&b->list, &b->c->btree_cache_freeable);
520}
521
522static unsigned btree_order(struct bkey *k)
523{
524 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
525}
526
527static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
528{
529 struct bset_tree *t = b->sets;
530 BUG_ON(t->data);
531
532 b->page_order = max_t(unsigned,
533 ilog2(b->c->btree_pages),
534 btree_order(k));
535
536 t->data = (void *) __get_free_pages(gfp, b->page_order);
537 if (!t->data)
538 goto err;
539
540 t->tree = bset_tree_bytes(b) < PAGE_SIZE
541 ? kmalloc(bset_tree_bytes(b), gfp)
542 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
543 if (!t->tree)
544 goto err;
545
546 t->prev = bset_prev_bytes(b) < PAGE_SIZE
547 ? kmalloc(bset_prev_bytes(b), gfp)
548 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
549 if (!t->prev)
550 goto err;
551
552 list_move(&b->list, &b->c->btree_cache);
553 b->c->bucket_cache_used++;
554 return;
555err:
556 mca_data_free(b);
557}
558
559static struct btree *mca_bucket_alloc(struct cache_set *c,
560 struct bkey *k, gfp_t gfp)
561{
562 struct btree *b = kzalloc(sizeof(struct btree), gfp);
563 if (!b)
564 return NULL;
565
566 init_rwsem(&b->lock);
567 lockdep_set_novalidate_class(&b->lock);
568 INIT_LIST_HEAD(&b->list);
57943511 569 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
cafe5635
KO
570 b->c = c;
571 closure_init_unlocked(&b->io);
572
573 mca_data_alloc(b, k, gfp);
574 return b;
575}
576
577static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
578{
579 lockdep_assert_held(&b->c->bucket_lock);
580
581 if (!down_write_trylock(&b->lock))
582 return -ENOMEM;
583
584 if (b->page_order < min_order) {
585 rw_unlock(true, b);
586 return -ENOMEM;
587 }
588
589 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
590
591 if (cl && btree_node_dirty(b))
57943511 592 bch_btree_node_write(b, NULL);
cafe5635
KO
593
594 if (cl)
595 closure_wait_event_async(&b->io.wait, cl,
596 atomic_read(&b->io.cl.remaining) == -1);
597
598 if (btree_node_dirty(b) ||
599 !closure_is_unlocked(&b->io.cl) ||
600 work_pending(&b->work.work)) {
601 rw_unlock(true, b);
602 return -EAGAIN;
603 }
604
605 return 0;
606}
607
7dc19d5a
DC
608static unsigned long bch_mca_scan(struct shrinker *shrink,
609 struct shrink_control *sc)
cafe5635
KO
610{
611 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
612 struct btree *b, *t;
613 unsigned long i, nr = sc->nr_to_scan;
7dc19d5a 614 unsigned long freed = 0;
cafe5635
KO
615
616 if (c->shrinker_disabled)
7dc19d5a 617 return SHRINK_STOP;
cafe5635
KO
618
619 if (c->try_harder)
7dc19d5a 620 return SHRINK_STOP;
cafe5635
KO
621
622 /* Return -1 if we can't do anything right now */
a698e08c 623 if (sc->gfp_mask & __GFP_IO)
cafe5635
KO
624 mutex_lock(&c->bucket_lock);
625 else if (!mutex_trylock(&c->bucket_lock))
626 return -1;
627
36c9ea98
KO
628 /*
629 * It's _really_ critical that we don't free too many btree nodes - we
630 * have to always leave ourselves a reserve. The reserve is how we
631 * guarantee that allocating memory for a new btree node can always
632 * succeed, so that inserting keys into the btree can always succeed and
633 * IO can always make forward progress:
634 */
cafe5635
KO
635 nr /= c->btree_pages;
636 nr = min_t(unsigned long, nr, mca_can_free(c));
637
638 i = 0;
639 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
7dc19d5a 640 if (freed >= nr)
cafe5635
KO
641 break;
642
643 if (++i > 3 &&
644 !mca_reap(b, NULL, 0)) {
645 mca_data_free(b);
646 rw_unlock(true, b);
7dc19d5a 647 freed++;
cafe5635
KO
648 }
649 }
650
651 /*
652 * Can happen right when we first start up, before we've read in any
653 * btree nodes
654 */
655 if (list_empty(&c->btree_cache))
656 goto out;
657
7dc19d5a 658 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
cafe5635
KO
659 b = list_first_entry(&c->btree_cache, struct btree, list);
660 list_rotate_left(&c->btree_cache);
661
662 if (!b->accessed &&
663 !mca_reap(b, NULL, 0)) {
664 mca_bucket_free(b);
665 mca_data_free(b);
666 rw_unlock(true, b);
7dc19d5a 667 freed++;
cafe5635
KO
668 } else
669 b->accessed = 0;
670 }
671out:
cafe5635 672 mutex_unlock(&c->bucket_lock);
7dc19d5a
DC
673 return freed;
674}
675
676static unsigned long bch_mca_count(struct shrinker *shrink,
677 struct shrink_control *sc)
678{
679 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
680
681 if (c->shrinker_disabled)
682 return 0;
683
684 if (c->try_harder)
685 return 0;
686
687 return mca_can_free(c) * c->btree_pages;
cafe5635
KO
688}
689
690void bch_btree_cache_free(struct cache_set *c)
691{
692 struct btree *b;
693 struct closure cl;
694 closure_init_stack(&cl);
695
696 if (c->shrink.list.next)
697 unregister_shrinker(&c->shrink);
698
699 mutex_lock(&c->bucket_lock);
700
701#ifdef CONFIG_BCACHE_DEBUG
702 if (c->verify_data)
703 list_move(&c->verify_data->list, &c->btree_cache);
704#endif
705
706 list_splice(&c->btree_cache_freeable,
707 &c->btree_cache);
708
709 while (!list_empty(&c->btree_cache)) {
710 b = list_first_entry(&c->btree_cache, struct btree, list);
711
712 if (btree_node_dirty(b))
713 btree_complete_write(b, btree_current_write(b));
714 clear_bit(BTREE_NODE_dirty, &b->flags);
715
716 mca_data_free(b);
717 }
718
719 while (!list_empty(&c->btree_cache_freed)) {
720 b = list_first_entry(&c->btree_cache_freed,
721 struct btree, list);
722 list_del(&b->list);
723 cancel_delayed_work_sync(&b->work);
724 kfree(b);
725 }
726
727 mutex_unlock(&c->bucket_lock);
728}
729
730int bch_btree_cache_alloc(struct cache_set *c)
731{
732 unsigned i;
733
734 /* XXX: doesn't check for errors */
735
736 closure_init_unlocked(&c->gc);
737
738 for (i = 0; i < mca_reserve(c); i++)
739 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
740
741 list_splice_init(&c->btree_cache,
742 &c->btree_cache_freeable);
743
744#ifdef CONFIG_BCACHE_DEBUG
745 mutex_init(&c->verify_lock);
746
747 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
748
749 if (c->verify_data &&
750 c->verify_data->sets[0].data)
751 list_del_init(&c->verify_data->list);
752 else
753 c->verify_data = NULL;
754#endif
755
7dc19d5a
DC
756 c->shrink.count_objects = bch_mca_count;
757 c->shrink.scan_objects = bch_mca_scan;
cafe5635
KO
758 c->shrink.seeks = 4;
759 c->shrink.batch = c->btree_pages * 2;
760 register_shrinker(&c->shrink);
761
762 return 0;
763}
764
765/* Btree in memory cache - hash table */
766
767static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
768{
769 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
770}
771
772static struct btree *mca_find(struct cache_set *c, struct bkey *k)
773{
774 struct btree *b;
775
776 rcu_read_lock();
777 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
778 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
779 goto out;
780 b = NULL;
781out:
782 rcu_read_unlock();
783 return b;
784}
785
786static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
787 int level, struct closure *cl)
788{
789 int ret = -ENOMEM;
790 struct btree *i;
791
c37511b8
KO
792 trace_bcache_btree_cache_cannibalize(c);
793
cafe5635
KO
794 if (!cl)
795 return ERR_PTR(-ENOMEM);
796
797 /*
798 * Trying to free up some memory - i.e. reuse some btree nodes - may
799 * require initiating IO to flush the dirty part of the node. If we're
800 * running under generic_make_request(), that IO will never finish and
801 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
802 * punt to workqueue and retry.
803 */
804 if (current->bio_list)
805 return ERR_PTR(-EAGAIN);
806
807 if (c->try_harder && c->try_harder != cl) {
808 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
809 return ERR_PTR(-EAGAIN);
810 }
811
cafe5635
KO
812 c->try_harder = cl;
813 c->try_harder_start = local_clock();
814retry:
815 list_for_each_entry_reverse(i, &c->btree_cache, list) {
816 int r = mca_reap(i, cl, btree_order(k));
817 if (!r)
818 return i;
819 if (r != -ENOMEM)
820 ret = r;
821 }
822
823 if (ret == -EAGAIN &&
824 closure_blocking(cl)) {
825 mutex_unlock(&c->bucket_lock);
826 closure_sync(cl);
827 mutex_lock(&c->bucket_lock);
828 goto retry;
829 }
830
831 return ERR_PTR(ret);
832}
833
834/*
835 * We can only have one thread cannibalizing other cached btree nodes at a time,
836 * or we'll deadlock. We use an open coded mutex to ensure that, which a
837 * cannibalize_bucket() will take. This means every time we unlock the root of
838 * the btree, we need to release this lock if we have it held.
839 */
840void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
841{
842 if (c->try_harder == cl) {
169ef1cf 843 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
cafe5635
KO
844 c->try_harder = NULL;
845 __closure_wake_up(&c->try_wait);
846 }
847}
848
849static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
850 int level, struct closure *cl)
851{
852 struct btree *b;
853
854 lockdep_assert_held(&c->bucket_lock);
855
856 if (mca_find(c, k))
857 return NULL;
858
859 /* btree_free() doesn't free memory; it sticks the node on the end of
860 * the list. Check if there's any freed nodes there:
861 */
862 list_for_each_entry(b, &c->btree_cache_freeable, list)
863 if (!mca_reap(b, NULL, btree_order(k)))
864 goto out;
865
866 /* We never free struct btree itself, just the memory that holds the on
867 * disk node. Check the freed list before allocating a new one:
868 */
869 list_for_each_entry(b, &c->btree_cache_freed, list)
870 if (!mca_reap(b, NULL, 0)) {
871 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
872 if (!b->sets[0].data)
873 goto err;
874 else
875 goto out;
876 }
877
878 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
879 if (!b)
880 goto err;
881
882 BUG_ON(!down_write_trylock(&b->lock));
883 if (!b->sets->data)
884 goto err;
885out:
886 BUG_ON(!closure_is_unlocked(&b->io.cl));
887
888 bkey_copy(&b->key, k);
889 list_move(&b->list, &c->btree_cache);
890 hlist_del_init_rcu(&b->hash);
891 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
892
893 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
894 b->level = level;
d6fd3b11 895 b->parent = (void *) ~0UL;
cafe5635
KO
896
897 mca_reinit(b);
898
899 return b;
900err:
901 if (b)
902 rw_unlock(true, b);
903
904 b = mca_cannibalize(c, k, level, cl);
905 if (!IS_ERR(b))
906 goto out;
907
908 return b;
909}
910
911/**
912 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
913 * in from disk if necessary.
914 *
915 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
916 * if that closure is in non blocking mode, will return -EAGAIN.
917 *
918 * The btree node will have either a read or a write lock held, depending on
919 * level and op->lock.
920 */
921struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
922 int level, struct btree_op *op)
923{
924 int i = 0;
925 bool write = level <= op->lock;
926 struct btree *b;
927
928 BUG_ON(level < 0);
929retry:
930 b = mca_find(c, k);
931
932 if (!b) {
57943511
KO
933 if (current->bio_list)
934 return ERR_PTR(-EAGAIN);
935
cafe5635
KO
936 mutex_lock(&c->bucket_lock);
937 b = mca_alloc(c, k, level, &op->cl);
938 mutex_unlock(&c->bucket_lock);
939
940 if (!b)
941 goto retry;
942 if (IS_ERR(b))
943 return b;
944
57943511 945 bch_btree_node_read(b);
cafe5635
KO
946
947 if (!write)
948 downgrade_write(&b->lock);
949 } else {
950 rw_lock(write, b, level);
951 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
952 rw_unlock(write, b);
953 goto retry;
954 }
955 BUG_ON(b->level != level);
956 }
957
958 b->accessed = 1;
959
960 for (; i <= b->nsets && b->sets[i].size; i++) {
961 prefetch(b->sets[i].tree);
962 prefetch(b->sets[i].data);
963 }
964
965 for (; i <= b->nsets; i++)
966 prefetch(b->sets[i].data);
967
57943511 968 if (btree_node_io_error(b)) {
cafe5635 969 rw_unlock(write, b);
57943511
KO
970 return ERR_PTR(-EIO);
971 }
972
973 BUG_ON(!b->written);
cafe5635
KO
974
975 return b;
976}
977
978static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
979{
980 struct btree *b;
981
982 mutex_lock(&c->bucket_lock);
983 b = mca_alloc(c, k, level, NULL);
984 mutex_unlock(&c->bucket_lock);
985
986 if (!IS_ERR_OR_NULL(b)) {
57943511 987 bch_btree_node_read(b);
cafe5635
KO
988 rw_unlock(true, b);
989 }
990}
991
992/* Btree alloc */
993
994static void btree_node_free(struct btree *b, struct btree_op *op)
995{
996 unsigned i;
997
c37511b8
KO
998 trace_bcache_btree_node_free(b);
999
cafe5635
KO
1000 /*
1001 * The BUG_ON() in btree_node_get() implies that we must have a write
1002 * lock on parent to free or even invalidate a node
1003 */
1004 BUG_ON(op->lock <= b->level);
1005 BUG_ON(b == b->c->root);
cafe5635
KO
1006
1007 if (btree_node_dirty(b))
1008 btree_complete_write(b, btree_current_write(b));
1009 clear_bit(BTREE_NODE_dirty, &b->flags);
1010
cafe5635
KO
1011 cancel_delayed_work(&b->work);
1012
1013 mutex_lock(&b->c->bucket_lock);
1014
1015 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1016 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1017
1018 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1019 PTR_BUCKET(b->c, &b->key, i));
1020 }
1021
1022 bch_bucket_free(b->c, &b->key);
1023 mca_bucket_free(b);
1024 mutex_unlock(&b->c->bucket_lock);
1025}
1026
1027struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
1028 struct closure *cl)
1029{
1030 BKEY_PADDED(key) k;
1031 struct btree *b = ERR_PTR(-EAGAIN);
1032
1033 mutex_lock(&c->bucket_lock);
1034retry:
1035 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
1036 goto err;
1037
1038 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1039
1040 b = mca_alloc(c, &k.key, level, cl);
1041 if (IS_ERR(b))
1042 goto err_free;
1043
1044 if (!b) {
b1a67b0f
KO
1045 cache_bug(c,
1046 "Tried to allocate bucket that was in btree cache");
cafe5635
KO
1047 __bkey_put(c, &k.key);
1048 goto retry;
1049 }
1050
cafe5635
KO
1051 b->accessed = 1;
1052 bch_bset_init_next(b);
1053
1054 mutex_unlock(&c->bucket_lock);
c37511b8
KO
1055
1056 trace_bcache_btree_node_alloc(b);
cafe5635
KO
1057 return b;
1058err_free:
1059 bch_bucket_free(c, &k.key);
1060 __bkey_put(c, &k.key);
1061err:
1062 mutex_unlock(&c->bucket_lock);
c37511b8
KO
1063
1064 trace_bcache_btree_node_alloc_fail(b);
cafe5635
KO
1065 return b;
1066}
1067
1068static struct btree *btree_node_alloc_replacement(struct btree *b,
1069 struct closure *cl)
1070{
1071 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
1072 if (!IS_ERR_OR_NULL(n))
1073 bch_btree_sort_into(b, n);
1074
1075 return n;
1076}
1077
1078/* Garbage collection */
1079
1080uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1081{
1082 uint8_t stale = 0;
1083 unsigned i;
1084 struct bucket *g;
1085
1086 /*
1087 * ptr_invalid() can't return true for the keys that mark btree nodes as
1088 * freed, but since ptr_bad() returns true we'll never actually use them
1089 * for anything and thus we don't want mark their pointers here
1090 */
1091 if (!bkey_cmp(k, &ZERO_KEY))
1092 return stale;
1093
1094 for (i = 0; i < KEY_PTRS(k); i++) {
1095 if (!ptr_available(c, k, i))
1096 continue;
1097
1098 g = PTR_BUCKET(c, k, i);
1099
1100 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1101 g->gc_gen = PTR_GEN(k, i);
1102
1103 if (ptr_stale(c, k, i)) {
1104 stale = max(stale, ptr_stale(c, k, i));
1105 continue;
1106 }
1107
1108 cache_bug_on(GC_MARK(g) &&
1109 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1110 c, "inconsistent ptrs: mark = %llu, level = %i",
1111 GC_MARK(g), level);
1112
1113 if (level)
1114 SET_GC_MARK(g, GC_MARK_METADATA);
1115 else if (KEY_DIRTY(k))
1116 SET_GC_MARK(g, GC_MARK_DIRTY);
1117
1118 /* guard against overflow */
1119 SET_GC_SECTORS_USED(g, min_t(unsigned,
1120 GC_SECTORS_USED(g) + KEY_SIZE(k),
1121 (1 << 14) - 1));
1122
1123 BUG_ON(!GC_SECTORS_USED(g));
1124 }
1125
1126 return stale;
1127}
1128
1129#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1130
1131static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1132 struct gc_stat *gc)
1133{
1134 uint8_t stale = 0;
1135 unsigned last_dev = -1;
1136 struct bcache_device *d = NULL;
1137 struct bkey *k;
1138 struct btree_iter iter;
1139 struct bset_tree *t;
1140
1141 gc->nodes++;
1142
1143 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1144 if (last_dev != KEY_INODE(k)) {
1145 last_dev = KEY_INODE(k);
1146
1147 d = KEY_INODE(k) < b->c->nr_uuids
1148 ? b->c->devices[last_dev]
1149 : NULL;
1150 }
1151
1152 stale = max(stale, btree_mark_key(b, k));
1153
1154 if (bch_ptr_bad(b, k))
1155 continue;
1156
1157 *keys += bkey_u64s(k);
1158
1159 gc->key_bytes += bkey_u64s(k);
1160 gc->nkeys++;
1161
1162 gc->data += KEY_SIZE(k);
444fc0b6 1163 if (KEY_DIRTY(k))
cafe5635 1164 gc->dirty += KEY_SIZE(k);
cafe5635
KO
1165 }
1166
1167 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1168 btree_bug_on(t->size &&
1169 bset_written(b, t) &&
1170 bkey_cmp(&b->key, &t->end) < 0,
1171 b, "found short btree key in gc");
1172
1173 return stale;
1174}
1175
1176static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1177 struct btree_op *op)
1178{
1179 /*
1180 * We block priorities from being written for the duration of garbage
1181 * collection, so we can't sleep in btree_alloc() ->
1182 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1183 * our closure.
1184 */
1185 struct btree *n = btree_node_alloc_replacement(b, NULL);
1186
1187 if (!IS_ERR_OR_NULL(n)) {
1188 swap(b, n);
57943511 1189 __bkey_put(b->c, &b->key);
cafe5635
KO
1190
1191 memcpy(k->ptr, b->key.ptr,
1192 sizeof(uint64_t) * KEY_PTRS(&b->key));
1193
cafe5635
KO
1194 btree_node_free(n, op);
1195 up_write(&n->lock);
1196 }
1197
1198 return b;
1199}
1200
/*
 * Leaving this at 2 until we've got incremental garbage collection done; it
 * could be higher (and has been tested with 4) except that garbage collection
 * could take much longer, adversely affecting latency.
 */
#define GC_MERGE_NODES	2U

/* One entry of the window of sibling nodes considered for coalescing */
struct gc_merge_info {
	struct btree	*b;	/* child node, as returned by bch_btree_node_get() */
	struct bkey	*k;	/* key in the parent that was used to fetch @b */
	unsigned	keys;	/* u64 count accumulated by btree_gc_mark_node() */
};
1213
1214static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1215 struct gc_stat *gc, struct gc_merge_info *r)
1216{
1217 unsigned nodes = 0, keys = 0, blocks;
1218 int i;
1219
1220 while (nodes < GC_MERGE_NODES && r[nodes].b)
1221 keys += r[nodes++].keys;
1222
1223 blocks = btree_default_blocks(b->c) * 2 / 3;
1224
1225 if (nodes < 2 ||
1226 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1227 return;
1228
1229 for (i = nodes - 1; i >= 0; --i) {
1230 if (r[i].b->written)
1231 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1232
1233 if (r[i].b->written)
1234 return;
1235 }
1236
1237 for (i = nodes - 1; i > 0; --i) {
1238 struct bset *n1 = r[i].b->sets->data;
1239 struct bset *n2 = r[i - 1].b->sets->data;
1240 struct bkey *k, *last = NULL;
1241
1242 keys = 0;
1243
1244 if (i == 1) {
1245 /*
1246 * Last node we're not getting rid of - we're getting
1247 * rid of the node at r[0]. Have to try and fit all of
1248 * the remaining keys into this node; we can't ensure
1249 * they will always fit due to rounding and variable
1250 * length keys (shouldn't be possible in practice,
1251 * though)
1252 */
1253 if (__set_blocks(n1, n1->keys + r->keys,
1254 b->c) > btree_blocks(r[i].b))
1255 return;
1256
1257 keys = n2->keys;
1258 last = &r->b->key;
1259 } else
1260 for (k = n2->start;
1261 k < end(n2);
1262 k = bkey_next(k)) {
1263 if (__set_blocks(n1, n1->keys + keys +
1264 bkey_u64s(k), b->c) > blocks)
1265 break;
1266
1267 last = k;
1268 keys += bkey_u64s(k);
1269 }
1270
1271 BUG_ON(__set_blocks(n1, n1->keys + keys,
1272 b->c) > btree_blocks(r[i].b));
1273
1274 if (last) {
1275 bkey_copy_key(&r[i].b->key, last);
1276 bkey_copy_key(r[i].k, last);
1277 }
1278
1279 memcpy(end(n1),
1280 n2->start,
1281 (void *) node(n2, keys) - (void *) n2->start);
1282
1283 n1->keys += keys;
1284
1285 memmove(n2->start,
1286 node(n2, keys),
1287 (void *) end(n2) - (void *) node(n2, keys));
1288
1289 n2->keys -= keys;
1290
1291 r[i].keys = n1->keys;
1292 r[i - 1].keys = n2->keys;
1293 }
1294
1295 btree_node_free(r->b, op);
1296 up_write(&r->b->lock);
1297
c37511b8 1298 trace_bcache_btree_gc_coalesce(nodes);
cafe5635
KO
1299
1300 gc->nodes--;
1301 nodes--;
1302
1303 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1304 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1305}
1306
1307static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1308 struct closure *writes, struct gc_stat *gc)
1309{
1310 void write(struct btree *r)
1311 {
1312 if (!r->written)
57943511
KO
1313 bch_btree_node_write(r, &op->cl);
1314 else if (btree_node_dirty(r))
1315 bch_btree_node_write(r, writes);
cafe5635
KO
1316
1317 up_write(&r->lock);
1318 }
1319
1320 int ret = 0, stale;
1321 unsigned i;
1322 struct gc_merge_info r[GC_MERGE_NODES];
1323
1324 memset(r, 0, sizeof(r));
1325
1326 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
1327 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
1328
1329 if (IS_ERR(r->b)) {
1330 ret = PTR_ERR(r->b);
1331 break;
1332 }
1333
1334 r->keys = 0;
1335 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1336
1337 if (!b->written &&
1338 (r->b->level || stale > 10 ||
1339 b->c->gc_always_rewrite))
1340 r->b = btree_gc_alloc(r->b, r->k, op);
1341
1342 if (r->b->level)
1343 ret = btree_gc_recurse(r->b, op, writes, gc);
1344
1345 if (ret) {
1346 write(r->b);
1347 break;
1348 }
1349
1350 bkey_copy_key(&b->c->gc_done, r->k);
1351
1352 if (!b->written)
1353 btree_gc_coalesce(b, op, gc, r);
1354
1355 if (r[GC_MERGE_NODES - 1].b)
1356 write(r[GC_MERGE_NODES - 1].b);
1357
1358 memmove(&r[1], &r[0],
1359 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1360
1361 /* When we've got incremental GC working, we'll want to do
1362 * if (should_resched())
1363 * return -EAGAIN;
1364 */
1365 cond_resched();
1366#if 0
1367 if (need_resched()) {
1368 ret = -EAGAIN;
1369 break;
1370 }
1371#endif
1372 }
1373
1374 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1375 write(r[i].b);
1376
1377 /* Might have freed some children, must remove their keys */
1378 if (!b->written)
1379 bch_btree_sort(b);
1380
1381 return ret;
1382}
1383
1384static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1385 struct closure *writes, struct gc_stat *gc)
1386{
1387 struct btree *n = NULL;
1388 unsigned keys = 0;
1389 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1390
1391 if (b->level || stale > 10)
1392 n = btree_node_alloc_replacement(b, NULL);
1393
1394 if (!IS_ERR_OR_NULL(n))
1395 swap(b, n);
1396
1397 if (b->level)
1398 ret = btree_gc_recurse(b, op, writes, gc);
1399
1400 if (!b->written || btree_node_dirty(b)) {
57943511 1401 bch_btree_node_write(b, n ? &op->cl : NULL);
cafe5635
KO
1402 }
1403
1404 if (!IS_ERR_OR_NULL(n)) {
1405 closure_sync(&op->cl);
1406 bch_btree_set_root(b);
1407 btree_node_free(n, op);
1408 rw_unlock(true, b);
1409 }
1410
1411 return ret;
1412}
1413
1414static void btree_gc_start(struct cache_set *c)
1415{
1416 struct cache *ca;
1417 struct bucket *b;
cafe5635
KO
1418 unsigned i;
1419
1420 if (!c->gc_mark_valid)
1421 return;
1422
1423 mutex_lock(&c->bucket_lock);
1424
1425 c->gc_mark_valid = 0;
1426 c->gc_done = ZERO_KEY;
1427
1428 for_each_cache(ca, c, i)
1429 for_each_bucket(b, ca) {
1430 b->gc_gen = b->gen;
29ebf465 1431 if (!atomic_read(&b->pin)) {
cafe5635 1432 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
29ebf465
KO
1433 SET_GC_SECTORS_USED(b, 0);
1434 }
cafe5635
KO
1435 }
1436
cafe5635
KO
1437 mutex_unlock(&c->bucket_lock);
1438}
1439
1440size_t bch_btree_gc_finish(struct cache_set *c)
1441{
1442 size_t available = 0;
1443 struct bucket *b;
1444 struct cache *ca;
cafe5635
KO
1445 unsigned i;
1446
1447 mutex_lock(&c->bucket_lock);
1448
1449 set_gc_sectors(c);
1450 c->gc_mark_valid = 1;
1451 c->need_gc = 0;
1452
1453 if (c->root)
1454 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1455 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1456 GC_MARK_METADATA);
1457
1458 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1459 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1460 GC_MARK_METADATA);
1461
1462 for_each_cache(ca, c, i) {
1463 uint64_t *i;
1464
1465 ca->invalidate_needs_gc = 0;
1466
1467 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1468 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1469
1470 for (i = ca->prio_buckets;
1471 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1472 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1473
1474 for_each_bucket(b, ca) {
1475 b->last_gc = b->gc_gen;
1476 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1477
1478 if (!atomic_read(&b->pin) &&
1479 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1480 available++;
1481 if (!GC_SECTORS_USED(b))
1482 bch_bucket_add_unused(ca, b);
1483 }
1484 }
1485 }
1486
cafe5635
KO
1487 mutex_unlock(&c->bucket_lock);
1488 return available;
1489}
1490
1491static void bch_btree_gc(struct closure *cl)
1492{
1493 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1494 int ret;
1495 unsigned long available;
1496 struct gc_stat stats;
1497 struct closure writes;
1498 struct btree_op op;
cafe5635 1499 uint64_t start_time = local_clock();
57943511 1500
c37511b8 1501 trace_bcache_gc_start(c);
cafe5635
KO
1502
1503 memset(&stats, 0, sizeof(struct gc_stat));
1504 closure_init_stack(&writes);
1505 bch_btree_op_init_stack(&op);
1506 op.lock = SHRT_MAX;
1507
1508 btree_gc_start(c);
1509
57943511
KO
1510 atomic_inc(&c->prio_blocked);
1511
cafe5635
KO
1512 ret = btree_root(gc_root, c, &op, &writes, &stats);
1513 closure_sync(&op.cl);
1514 closure_sync(&writes);
1515
1516 if (ret) {
cafe5635 1517 pr_warn("gc failed!");
cafe5635
KO
1518 continue_at(cl, bch_btree_gc, bch_gc_wq);
1519 }
1520
1521 /* Possibly wait for new UUIDs or whatever to hit disk */
1522 bch_journal_meta(c, &op.cl);
1523 closure_sync(&op.cl);
1524
1525 available = bch_btree_gc_finish(c);
1526
57943511
KO
1527 atomic_dec(&c->prio_blocked);
1528 wake_up_allocators(c);
1529
169ef1cf 1530 bch_time_stats_update(&c->btree_gc_time, start_time);
cafe5635
KO
1531
1532 stats.key_bytes *= sizeof(uint64_t);
1533 stats.dirty <<= 9;
1534 stats.data <<= 9;
1535 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1536 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
cafe5635 1537
c37511b8 1538 trace_bcache_gc_end(c);
cafe5635
KO
1539
1540 continue_at(cl, bch_moving_gc, bch_gc_wq);
1541}
1542
1543void bch_queue_gc(struct cache_set *c)
1544{
1545 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1546}
1547
1548/* Initial partial gc */
1549
1550static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1551 unsigned long **seen)
1552{
1553 int ret;
1554 unsigned i;
1555 struct bkey *k;
1556 struct bucket *g;
1557 struct btree_iter iter;
1558
1559 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1560 for (i = 0; i < KEY_PTRS(k); i++) {
1561 if (!ptr_available(b->c, k, i))
1562 continue;
1563
1564 g = PTR_BUCKET(b->c, k, i);
1565
1566 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1567 seen[PTR_DEV(k, i)]) ||
1568 !ptr_stale(b->c, k, i)) {
1569 g->gen = PTR_GEN(k, i);
1570
1571 if (b->level)
1572 g->prio = BTREE_PRIO;
1573 else if (g->prio == BTREE_PRIO)
1574 g->prio = INITIAL_PRIO;
1575 }
1576 }
1577
1578 btree_mark_key(b, k);
1579 }
1580
1581 if (b->level) {
1582 k = bch_next_recurse_key(b, &ZERO_KEY);
1583
1584 while (k) {
1585 struct bkey *p = bch_next_recurse_key(b, k);
1586 if (p)
1587 btree_node_prefetch(b->c, p, b->level - 1);
1588
1589 ret = btree(check_recurse, k, b, op, seen);
1590 if (ret)
1591 return ret;
1592
1593 k = p;
1594 }
1595 }
1596
1597 return 0;
1598}
1599
1600int bch_btree_check(struct cache_set *c, struct btree_op *op)
1601{
1602 int ret = -ENOMEM;
1603 unsigned i;
1604 unsigned long *seen[MAX_CACHES_PER_SET];
1605
1606 memset(seen, 0, sizeof(seen));
1607
1608 for (i = 0; c->cache[i]; i++) {
1609 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1610 seen[i] = kmalloc(n, GFP_KERNEL);
1611 if (!seen[i])
1612 goto err;
1613
1614 /* Disables the seen array until prio_read() uses it too */
1615 memset(seen[i], 0xFF, n);
1616 }
1617
1618 ret = btree_root(check_recurse, c, op, seen);
1619err:
1620 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1621 kfree(seen[i]);
1622 return ret;
1623}
1624
1625/* Btree insertion */
1626
1627static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1628{
1629 struct bset *i = b->sets[b->nsets].data;
1630
1631 memmove((uint64_t *) where + bkey_u64s(insert),
1632 where,
1633 (void *) end(i) - (void *) where);
1634
1635 i->keys += bkey_u64s(insert);
1636 bkey_copy(where, insert);
1637 bch_bset_fix_lookup_table(b, where);
1638}
1639
1640static bool fix_overlapping_extents(struct btree *b,
1641 struct bkey *insert,
1642 struct btree_iter *iter,
1643 struct btree_op *op)
1644{
279afbad 1645 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
cafe5635 1646 {
279afbad
KO
1647 if (KEY_DIRTY(k))
1648 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1649 offset, -sectors);
cafe5635
KO
1650 }
1651
279afbad 1652 uint64_t old_offset;
cafe5635
KO
1653 unsigned old_size, sectors_found = 0;
1654
1655 while (1) {
1656 struct bkey *k = bch_btree_iter_next(iter);
1657 if (!k ||
1658 bkey_cmp(&START_KEY(k), insert) >= 0)
1659 break;
1660
1661 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1662 continue;
1663
279afbad 1664 old_offset = KEY_START(k);
cafe5635
KO
1665 old_size = KEY_SIZE(k);
1666
1667 /*
1668 * We might overlap with 0 size extents; we can't skip these
1669 * because if they're in the set we're inserting to we have to
1670 * adjust them so they don't overlap with the key we're
1671 * inserting. But we don't want to check them for BTREE_REPLACE
1672 * operations.
1673 */
1674
1675 if (op->type == BTREE_REPLACE &&
1676 KEY_SIZE(k)) {
1677 /*
1678 * k might have been split since we inserted/found the
1679 * key we're replacing
1680 */
1681 unsigned i;
1682 uint64_t offset = KEY_START(k) -
1683 KEY_START(&op->replace);
1684
1685 /* But it must be a subset of the replace key */
1686 if (KEY_START(k) < KEY_START(&op->replace) ||
1687 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1688 goto check_failed;
1689
1690 /* We didn't find a key that we were supposed to */
1691 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1692 goto check_failed;
1693
1694 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1695 goto check_failed;
1696
1697 /* skip past gen */
1698 offset <<= 8;
1699
1700 BUG_ON(!KEY_PTRS(&op->replace));
1701
1702 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1703 if (k->ptr[i] != op->replace.ptr[i] + offset)
1704 goto check_failed;
1705
1706 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1707 }
1708
1709 if (bkey_cmp(insert, k) < 0 &&
1710 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1711 /*
1712 * We overlapped in the middle of an existing key: that
1713 * means we have to split the old key. But we have to do
1714 * slightly different things depending on whether the
1715 * old key has been written out yet.
1716 */
1717
1718 struct bkey *top;
1719
279afbad 1720 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
cafe5635
KO
1721
1722 if (bkey_written(b, k)) {
1723 /*
1724 * We insert a new key to cover the top of the
1725 * old key, and the old key is modified in place
1726 * to represent the bottom split.
1727 *
1728 * It's completely arbitrary whether the new key
1729 * is the top or the bottom, but it has to match
1730 * up with what btree_sort_fixup() does - it
1731 * doesn't check for this kind of overlap, it
1732 * depends on us inserting a new key for the top
1733 * here.
1734 */
1735 top = bch_bset_search(b, &b->sets[b->nsets],
1736 insert);
1737 shift_keys(b, top, k);
1738 } else {
1739 BKEY_PADDED(key) temp;
1740 bkey_copy(&temp.key, k);
1741 shift_keys(b, k, &temp.key);
1742 top = bkey_next(k);
1743 }
1744
1745 bch_cut_front(insert, top);
1746 bch_cut_back(&START_KEY(insert), k);
1747 bch_bset_fix_invalidated_key(b, k);
1748 return false;
1749 }
1750
1751 if (bkey_cmp(insert, k) < 0) {
1752 bch_cut_front(insert, k);
1753 } else {
1fa8455d
KO
1754 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1755 old_offset = KEY_START(insert);
1756
cafe5635
KO
1757 if (bkey_written(b, k) &&
1758 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1759 /*
1760 * Completely overwrote, so we don't have to
1761 * invalidate the binary search tree
1762 */
1763 bch_cut_front(k, k);
1764 } else {
1765 __bch_cut_back(&START_KEY(insert), k);
1766 bch_bset_fix_invalidated_key(b, k);
1767 }
1768 }
1769
279afbad 1770 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
cafe5635
KO
1771 }
1772
1773check_failed:
1774 if (op->type == BTREE_REPLACE) {
1775 if (!sectors_found) {
1776 op->insert_collision = true;
1777 return true;
1778 } else if (sectors_found < KEY_SIZE(insert)) {
1779 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1780 (KEY_SIZE(insert) - sectors_found));
1781 SET_KEY_SIZE(insert, sectors_found);
1782 }
1783 }
1784
1785 return false;
1786}
1787
1788static bool btree_insert_key(struct btree *b, struct btree_op *op,
1789 struct bkey *k)
1790{
1791 struct bset *i = b->sets[b->nsets].data;
1792 struct bkey *m, *prev;
85b1492e 1793 unsigned status = BTREE_INSERT_STATUS_INSERT;
cafe5635
KO
1794
1795 BUG_ON(bkey_cmp(k, &b->key) > 0);
1796 BUG_ON(b->level && !KEY_PTRS(k));
1797 BUG_ON(!b->level && !KEY_OFFSET(k));
1798
1799 if (!b->level) {
1800 struct btree_iter iter;
1801 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1802
1803 /*
1804 * bset_search() returns the first key that is strictly greater
1805 * than the search key - but for back merging, we want to find
1806 * the first key that is greater than or equal to KEY_START(k) -
1807 * unless KEY_START(k) is 0.
1808 */
1809 if (KEY_OFFSET(&search))
1810 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1811
1812 prev = NULL;
1813 m = bch_btree_iter_init(b, &iter, &search);
1814
1815 if (fix_overlapping_extents(b, k, &iter, op))
1816 return false;
1817
1fa8455d
KO
1818 if (KEY_DIRTY(k))
1819 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1820 KEY_START(k), KEY_SIZE(k));
1821
cafe5635
KO
1822 while (m != end(i) &&
1823 bkey_cmp(k, &START_KEY(m)) > 0)
1824 prev = m, m = bkey_next(m);
1825
1826 if (key_merging_disabled(b->c))
1827 goto insert;
1828
1829 /* prev is in the tree, if we merge we're done */
85b1492e 1830 status = BTREE_INSERT_STATUS_BACK_MERGE;
cafe5635
KO
1831 if (prev &&
1832 bch_bkey_try_merge(b, prev, k))
1833 goto merged;
1834
85b1492e 1835 status = BTREE_INSERT_STATUS_OVERWROTE;
cafe5635
KO
1836 if (m != end(i) &&
1837 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1838 goto copy;
1839
85b1492e 1840 status = BTREE_INSERT_STATUS_FRONT_MERGE;
cafe5635
KO
1841 if (m != end(i) &&
1842 bch_bkey_try_merge(b, k, m))
1843 goto copy;
1844 } else
1845 m = bch_bset_search(b, &b->sets[b->nsets], k);
1846
1847insert: shift_keys(b, m, k);
1848copy: bkey_copy(m, k);
1849merged:
85b1492e 1850 bch_check_keys(b, "%u for %s", status, op_type(op));
cafe5635
KO
1851
1852 if (b->level && !KEY_OFFSET(k))
57943511 1853 btree_current_write(b)->prio_blocked++;
cafe5635 1854
85b1492e 1855 trace_bcache_btree_insert_key(b, k, op->type, status);
cafe5635
KO
1856
1857 return true;
1858}
1859
26c949f8
KO
1860static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1861 struct keylist *insert_keys)
cafe5635
KO
1862{
1863 bool ret = false;
cafe5635
KO
1864 unsigned oldsize = bch_count_data(b);
1865
26c949f8 1866 while (!bch_keylist_empty(insert_keys)) {
403b6cde 1867 struct bset *i = write_block(b);
c2f95ae2 1868 struct bkey *k = insert_keys->keys;
26c949f8 1869
403b6cde
KO
1870 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
1871 > btree_blocks(b))
1872 break;
1873
1874 if (bkey_cmp(k, &b->key) <= 0) {
26c949f8
KO
1875 bkey_put(b->c, k, b->level);
1876
1877 ret |= btree_insert_key(b, op, k);
1878 bch_keylist_pop_front(insert_keys);
1879 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
1880#if 0
1881 if (op->type == BTREE_REPLACE) {
1882 bkey_put(b->c, k, b->level);
1883 bch_keylist_pop_front(insert_keys);
1884 op->insert_collision = true;
1885 break;
1886 }
1887#endif
1888 BKEY_PADDED(key) temp;
c2f95ae2 1889 bkey_copy(&temp.key, insert_keys->keys);
26c949f8
KO
1890
1891 bch_cut_back(&b->key, &temp.key);
c2f95ae2 1892 bch_cut_front(&b->key, insert_keys->keys);
26c949f8
KO
1893
1894 ret |= btree_insert_key(b, op, &temp.key);
1895 break;
1896 } else {
1897 break;
1898 }
cafe5635
KO
1899 }
1900
403b6cde
KO
1901 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
1902
cafe5635
KO
1903 BUG_ON(bch_count_data(b) < oldsize);
1904 return ret;
1905}
1906
26c949f8
KO
1907static int btree_split(struct btree *b, struct btree_op *op,
1908 struct keylist *insert_keys,
1909 struct keylist *parent_keys)
cafe5635 1910{
d6fd3b11 1911 bool split;
cafe5635
KO
1912 struct btree *n1, *n2 = NULL, *n3 = NULL;
1913 uint64_t start_time = local_clock();
1914
1915 if (b->level)
1916 set_closure_blocking(&op->cl);
1917
1918 n1 = btree_node_alloc_replacement(b, &op->cl);
1919 if (IS_ERR(n1))
1920 goto err;
1921
1922 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1923
cafe5635
KO
1924 if (split) {
1925 unsigned keys = 0;
1926
c37511b8
KO
1927 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1928
cafe5635
KO
1929 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1930 if (IS_ERR(n2))
1931 goto err_free1;
1932
d6fd3b11 1933 if (!b->parent) {
cafe5635
KO
1934 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
1935 if (IS_ERR(n3))
1936 goto err_free2;
1937 }
1938
26c949f8 1939 bch_btree_insert_keys(n1, op, insert_keys);
cafe5635 1940
d6fd3b11
KO
1941 /*
1942 * Has to be a linear search because we don't have an auxiliary
cafe5635
KO
1943 * search tree yet
1944 */
1945
1946 while (keys < (n1->sets[0].data->keys * 3) / 5)
1947 keys += bkey_u64s(node(n1->sets[0].data, keys));
1948
1949 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1950 keys += bkey_u64s(node(n1->sets[0].data, keys));
1951
1952 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1953 n1->sets[0].data->keys = keys;
1954
1955 memcpy(n2->sets[0].data->start,
1956 end(n1->sets[0].data),
1957 n2->sets[0].data->keys * sizeof(uint64_t));
1958
1959 bkey_copy_key(&n2->key, &b->key);
1960
26c949f8 1961 bch_keylist_add(parent_keys, &n2->key);
57943511 1962 bch_btree_node_write(n2, &op->cl);
cafe5635 1963 rw_unlock(true, n2);
c37511b8
KO
1964 } else {
1965 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1966
26c949f8 1967 bch_btree_insert_keys(n1, op, insert_keys);
c37511b8 1968 }
cafe5635 1969
26c949f8 1970 bch_keylist_add(parent_keys, &n1->key);
57943511 1971 bch_btree_node_write(n1, &op->cl);
cafe5635
KO
1972
1973 if (n3) {
d6fd3b11
KO
1974 /* Depth increases, make a new root */
1975
cafe5635 1976 bkey_copy_key(&n3->key, &MAX_KEY);
26c949f8 1977 bch_btree_insert_keys(n3, op, parent_keys);
57943511 1978 bch_btree_node_write(n3, &op->cl);
cafe5635
KO
1979
1980 closure_sync(&op->cl);
1981 bch_btree_set_root(n3);
1982 rw_unlock(true, n3);
d6fd3b11
KO
1983 } else if (!b->parent) {
1984 /* Root filled up but didn't need to be split */
1985
c2f95ae2 1986 bch_keylist_reset(parent_keys);
cafe5635
KO
1987 closure_sync(&op->cl);
1988 bch_btree_set_root(n1);
1989 } else {
1990 unsigned i;
1991
26c949f8
KO
1992 bkey_copy(parent_keys->top, &b->key);
1993 bkey_copy_key(parent_keys->top, &ZERO_KEY);
cafe5635
KO
1994
1995 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1996 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1997
26c949f8 1998 SET_PTR_GEN(parent_keys->top, i, g);
cafe5635
KO
1999 }
2000
26c949f8 2001 bch_keylist_push(parent_keys);
cafe5635
KO
2002 closure_sync(&op->cl);
2003 atomic_inc(&b->c->prio_blocked);
2004 }
2005
2006 rw_unlock(true, n1);
2007 btree_node_free(b, op);
2008
169ef1cf 2009 bch_time_stats_update(&b->c->btree_split_time, start_time);
cafe5635
KO
2010
2011 return 0;
2012err_free2:
2013 __bkey_put(n2->c, &n2->key);
2014 btree_node_free(n2, op);
2015 rw_unlock(true, n2);
2016err_free1:
2017 __bkey_put(n1->c, &n1->key);
2018 btree_node_free(n1, op);
2019 rw_unlock(true, n1);
2020err:
2021 if (n3 == ERR_PTR(-EAGAIN) ||
2022 n2 == ERR_PTR(-EAGAIN) ||
2023 n1 == ERR_PTR(-EAGAIN))
2024 return -EAGAIN;
2025
2026 pr_warn("couldn't split");
2027 return -ENOMEM;
2028}
2029
26c949f8
KO
2030static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2031 struct keylist *insert_keys)
cafe5635 2032{
26c949f8
KO
2033 int ret = 0;
2034 struct keylist split_keys;
cafe5635 2035
26c949f8 2036 bch_keylist_init(&split_keys);
cafe5635 2037
26c949f8 2038 BUG_ON(b->level);
cafe5635 2039
26c949f8
KO
2040 do {
2041 if (should_split(b)) {
2042 if (current->bio_list) {
2043 op->lock = b->c->root->level + 1;
2044 ret = -EAGAIN;
2045 } else if (op->lock <= b->c->root->level) {
2046 op->lock = b->c->root->level + 1;
2047 ret = -EINTR;
2048 } else {
2049 struct btree *parent = b->parent;
cafe5635 2050
26c949f8
KO
2051 ret = btree_split(b, op, insert_keys,
2052 &split_keys);
2053 insert_keys = &split_keys;
2054 b = parent;
403b6cde
KO
2055 if (!ret)
2056 ret = -EINTR;
cafe5635 2057 }
26c949f8
KO
2058 } else {
2059 BUG_ON(write_block(b) != b->sets[b->nsets].data);
cafe5635 2060
26c949f8
KO
2061 if (bch_btree_insert_keys(b, op, insert_keys)) {
2062 if (!b->level)
2063 bch_btree_leaf_dirty(b, op);
2064 else
2065 bch_btree_node_write(b, &op->cl);
2066 }
cafe5635 2067 }
26c949f8 2068 } while (!bch_keylist_empty(&split_keys));
cafe5635 2069
26c949f8
KO
2070 return ret;
2071}
cafe5635 2072
e7c590eb
KO
2073int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2074 struct bkey *check_key)
2075{
2076 int ret = -EINTR;
2077 uint64_t btree_ptr = b->key.ptr[0];
2078 unsigned long seq = b->seq;
2079 struct keylist insert;
2080 bool upgrade = op->lock == -1;
2081
2082 bch_keylist_init(&insert);
2083
2084 if (upgrade) {
2085 rw_unlock(false, b);
2086 rw_lock(true, b, b->level);
2087
2088 if (b->key.ptr[0] != btree_ptr ||
2089 b->seq != seq + 1)
2090 goto out;
2091 }
2092
2093 SET_KEY_PTRS(check_key, 1);
2094 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2095
2096 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2097
2098 bch_keylist_add(&insert, check_key);
2099
2100 BUG_ON(op->type != BTREE_INSERT);
2101
2102 ret = bch_btree_insert_node(b, op, &insert);
2103
2104 BUG_ON(!ret && !bch_keylist_empty(&insert));
2105out:
2106 if (upgrade)
2107 downgrade_write(&b->lock);
2108 return ret;
2109}
2110
4f3d4014
KO
2111static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2112 struct keylist *keys)
26c949f8 2113{
4f3d4014 2114 if (bch_keylist_empty(keys))
403b6cde
KO
2115 return 0;
2116
26c949f8 2117 if (b->level) {
4f3d4014 2118 struct bkey *k;
cafe5635 2119
c2f95ae2 2120 k = bch_next_recurse_key(b, &START_KEY(keys->keys));
26c949f8
KO
2121 if (!k) {
2122 btree_bug(b, "no key to recurse on at level %i/%i",
2123 b->level, b->c->root->level);
cafe5635 2124
c2f95ae2 2125 bch_keylist_reset(keys);
26c949f8 2126 return -EIO;
57943511 2127 }
cafe5635 2128
4f3d4014 2129 return btree(insert_recurse, k, b, op, keys);
26c949f8 2130 } else {
4f3d4014 2131 return bch_btree_insert_node(b, op, keys);
26c949f8 2132 }
cafe5635
KO
2133}
2134
4f3d4014
KO
2135int bch_btree_insert(struct btree_op *op, struct cache_set *c,
2136 struct keylist *keys)
cafe5635
KO
2137{
2138 int ret = 0;
cafe5635
KO
2139
2140 /*
2141 * Don't want to block with the btree locked unless we have to,
2142 * otherwise we get deadlocks with try_harder and between split/gc
2143 */
2144 clear_closure_blocking(&op->cl);
2145
4f3d4014 2146 BUG_ON(bch_keylist_empty(keys));
cafe5635 2147
4f3d4014 2148 while (!bch_keylist_empty(keys)) {
403b6cde 2149 op->lock = 0;
4f3d4014 2150 ret = btree_root(insert_recurse, c, op, keys);
cafe5635
KO
2151
2152 if (ret == -EAGAIN) {
2153 ret = 0;
2154 closure_sync(&op->cl);
2155 } else if (ret) {
2156 struct bkey *k;
2157
2158 pr_err("error %i trying to insert key for %s",
2159 ret, op_type(op));
2160
4f3d4014 2161 while ((k = bch_keylist_pop(keys)))
cafe5635
KO
2162 bkey_put(c, k, 0);
2163 }
2164 }
2165
cafe5635
KO
2166 return ret;
2167}
2168
2169void bch_btree_set_root(struct btree *b)
2170{
2171 unsigned i;
e49c7c37
KO
2172 struct closure cl;
2173
2174 closure_init_stack(&cl);
cafe5635 2175
c37511b8
KO
2176 trace_bcache_btree_set_root(b);
2177
cafe5635
KO
2178 BUG_ON(!b->written);
2179
2180 for (i = 0; i < KEY_PTRS(&b->key); i++)
2181 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2182
2183 mutex_lock(&b->c->bucket_lock);
2184 list_del_init(&b->list);
2185 mutex_unlock(&b->c->bucket_lock);
2186
2187 b->c->root = b;
2188 __bkey_put(b->c, &b->key);
2189
e49c7c37
KO
2190 bch_journal_meta(b->c, &cl);
2191 closure_sync(&cl);
cafe5635
KO
2192}
2193
2194/* Cache lookup */
2195
2196static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2197 struct bkey *k)
2198{
2199 struct search *s = container_of(op, struct search, op);
2200 struct bio *bio = &s->bio.bio;
2201 int ret = 0;
2202
2203 while (!ret &&
2204 !op->lookup_done) {
2205 unsigned sectors = INT_MAX;
2206
2207 if (KEY_INODE(k) == op->inode) {
2208 if (KEY_START(k) <= bio->bi_sector)
2209 break;
2210
2211 sectors = min_t(uint64_t, sectors,
2212 KEY_START(k) - bio->bi_sector);
2213 }
2214
2215 ret = s->d->cache_miss(b, s, bio, sectors);
2216 }
2217
2218 return ret;
2219}
2220
2221/*
2222 * Read from a single key, handling the initial cache miss if the key starts in
2223 * the middle of the bio
2224 */
2225static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2226 struct bkey *k)
2227{
2228 struct search *s = container_of(op, struct search, op);
2229 struct bio *bio = &s->bio.bio;
2230 unsigned ptr;
2231 struct bio *n;
2232
2233 int ret = submit_partial_cache_miss(b, op, k);
2234 if (ret || op->lookup_done)
2235 return ret;
2236
2237 /* XXX: figure out best pointer - for multiple cache devices */
2238 ptr = 0;
2239
2240 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2241
2242 while (!op->lookup_done &&
2243 KEY_INODE(k) == op->inode &&
2244 bio->bi_sector < KEY_OFFSET(k)) {
2245 struct bkey *bio_key;
2246 sector_t sector = PTR_OFFSET(k, ptr) +
2247 (bio->bi_sector - KEY_START(k));
2248 unsigned sectors = min_t(uint64_t, INT_MAX,
2249 KEY_OFFSET(k) - bio->bi_sector);
2250
2251 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
cafe5635
KO
2252 if (n == bio)
2253 op->lookup_done = true;
2254
2255 bio_key = &container_of(n, struct bbio, bio)->key;
2256
2257 /*
2258 * The bucket we're reading from might be reused while our bio
2259 * is in flight, and we could then end up reading the wrong
2260 * data.
2261 *
2262 * We guard against this by checking (in cache_read_endio()) if
2263 * the pointer is stale again; if so, we treat it as an error
2264 * and reread from the backing device (but we don't pass that
2265 * error up anywhere).
2266 */
2267
2268 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2269 SET_PTR_OFFSET(bio_key, 0, sector);
2270
2271 n->bi_end_io = bch_cache_read_endio;
2272 n->bi_private = &s->cl;
2273
cafe5635
KO
2274 __bch_submit_bbio(n, b->c);
2275 }
2276
2277 return 0;
2278}
2279
2280int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2281{
2282 struct search *s = container_of(op, struct search, op);
2283 struct bio *bio = &s->bio.bio;
2284
2285 int ret = 0;
2286 struct bkey *k;
2287 struct btree_iter iter;
2288 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2289
cafe5635
KO
2290 do {
2291 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2292 if (!k) {
2293 /*
2294 * b->key would be exactly what we want, except that
2295 * pointers to btree nodes have nonzero size - we
2296 * wouldn't go far enough
2297 */
2298
2299 ret = submit_partial_cache_miss(b, op,
2300 &KEY(KEY_INODE(&b->key),
2301 KEY_OFFSET(&b->key), 0));
2302 break;
2303 }
2304
2305 ret = b->level
2306 ? btree(search_recurse, k, b, op)
2307 : submit_partial_cache_hit(b, op, k);
2308 } while (!ret &&
2309 !op->lookup_done);
2310
2311 return ret;
2312}
2313
2314/* Keybuf code */
2315
2316static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2317{
2318 /* Overlapping keys compare equal */
2319 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2320 return -1;
2321 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2322 return 1;
2323 return 0;
2324}
2325
2326static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2327 struct keybuf_key *r)
2328{
2329 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2330}
2331
2332static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
72c27061
KO
2333 struct keybuf *buf, struct bkey *end,
2334 keybuf_pred_fn *pred)
cafe5635
KO
2335{
2336 struct btree_iter iter;
2337 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2338
2339 while (!array_freelist_empty(&buf->freelist)) {
2340 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2341 bch_ptr_bad);
2342
2343 if (!b->level) {
2344 if (!k) {
2345 buf->last_scanned = b->key;
2346 break;
2347 }
2348
2349 buf->last_scanned = *k;
2350 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2351 break;
2352
72c27061 2353 if (pred(buf, k)) {
cafe5635
KO
2354 struct keybuf_key *w;
2355
cafe5635
KO
2356 spin_lock(&buf->lock);
2357
2358 w = array_alloc(&buf->freelist);
2359
2360 w->private = NULL;
2361 bkey_copy(&w->key, k);
2362
2363 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2364 array_free(&buf->freelist, w);
2365
2366 spin_unlock(&buf->lock);
2367 }
2368 } else {
2369 if (!k)
2370 break;
2371
72c27061 2372 btree(refill_keybuf, k, b, op, buf, end, pred);
cafe5635
KO
2373 /*
2374 * Might get an error here, but can't really do anything
2375 * and it'll get logged elsewhere. Just read what we
2376 * can.
2377 */
2378
2379 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2380 break;
2381
2382 cond_resched();
2383 }
2384 }
2385
2386 return 0;
2387}
2388
2389void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
72c27061 2390 struct bkey *end, keybuf_pred_fn *pred)
cafe5635
KO
2391{
2392 struct bkey start = buf->last_scanned;
2393 struct btree_op op;
2394 bch_btree_op_init_stack(&op);
2395
2396 cond_resched();
2397
72c27061 2398 btree_root(refill_keybuf, c, &op, buf, end, pred);
cafe5635
KO
2399 closure_sync(&op.cl);
2400
2401 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2402 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2403 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2404 KEY_INODE(&start), KEY_OFFSET(&start),
2405 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2406
2407 spin_lock(&buf->lock);
2408
2409 if (!RB_EMPTY_ROOT(&buf->keys)) {
2410 struct keybuf_key *w;
2411 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2412 buf->start = START_KEY(&w->key);
2413
2414 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2415 buf->end = w->key;
2416 } else {
2417 buf->start = MAX_KEY;
2418 buf->end = MAX_KEY;
2419 }
2420
2421 spin_unlock(&buf->lock);
2422}
2423
2424static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2425{
2426 rb_erase(&w->node, &buf->keys);
2427 array_free(&buf->freelist, w);
2428}
2429
2430void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2431{
2432 spin_lock(&buf->lock);
2433 __bch_keybuf_del(buf, w);
2434 spin_unlock(&buf->lock);
2435}
2436
2437bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2438 struct bkey *end)
2439{
2440 bool ret = false;
2441 struct keybuf_key *p, *w, s;
2442 s.key = *start;
2443
2444 if (bkey_cmp(end, &buf->start) <= 0 ||
2445 bkey_cmp(start, &buf->end) >= 0)
2446 return false;
2447
2448 spin_lock(&buf->lock);
2449 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2450
2451 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2452 p = w;
2453 w = RB_NEXT(w, node);
2454
2455 if (p->private)
2456 ret = true;
2457 else
2458 __bch_keybuf_del(buf, p);
2459 }
2460
2461 spin_unlock(&buf->lock);
2462 return ret;
2463}
2464
2465struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2466{
2467 struct keybuf_key *w;
2468 spin_lock(&buf->lock);
2469
2470 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2471
2472 while (w && w->private)
2473 w = RB_NEXT(w, node);
2474
2475 if (w)
2476 w->private = ERR_PTR(-EINTR);
2477
2478 spin_unlock(&buf->lock);
2479 return w;
2480}
2481
2482struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2483 struct keybuf *buf,
72c27061
KO
2484 struct bkey *end,
2485 keybuf_pred_fn *pred)
cafe5635
KO
2486{
2487 struct keybuf_key *ret;
2488
2489 while (1) {
2490 ret = bch_keybuf_next(buf);
2491 if (ret)
2492 break;
2493
2494 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2495 pr_debug("scan finished");
2496 break;
2497 }
2498
72c27061 2499 bch_refill_keybuf(c, buf, end, pred);
cafe5635
KO
2500 }
2501
2502 return ret;
2503}
2504
72c27061 2505void bch_keybuf_init(struct keybuf *buf)
cafe5635 2506{
cafe5635
KO
2507 buf->last_scanned = MAX_KEY;
2508 buf->keys = RB_ROOT;
2509
2510 spin_lock_init(&buf->lock);
2511 array_allocator_init(&buf->freelist);
2512}
2513
2514void bch_btree_exit(void)
2515{
2516 if (btree_io_wq)
2517 destroy_workqueue(btree_io_wq);
2518 if (bch_gc_wq)
2519 destroy_workqueue(bch_gc_wq);
2520}
2521
2522int __init bch_btree_init(void)
2523{
2524 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2525 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2526 return -ENOMEM;
2527
2528 return 0;
2529}