#.c.o:
# $(CC) $(CFLAGS) -c $<
-ctree : $(objects)
- gcc $(CFLAGS) -o ctree $(objects)
+all: tester debug-tree
+
+debug-tree: $(objects) debug-tree.o
+ gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
+
+tester: $(objects) random-test.o
+ gcc $(CFLAGS) -o tester $(objects) random-test.o
$(objects) : $(headers)
+# "all" now builds tester and debug-tree instead of ctree, so remove those;
+# -f keeps clean from failing when a target has not been built yet.
clean :
-	rm ctree *.o
+	rm -f tester debug-tree *.o
+
return 0;
}
+/* some sample code to insert, search & delete items */
+#if 0
/* for testing only */
int next_key(int i, int max_key) {
return rand() % max_key;
//return i;
}
-
int main() {
- struct ctree_root *root;
struct key ins;
struct key last = { (u64)-1, 0, 0};
char *buf;
int tree_size = 0;
struct ctree_path path;
struct ctree_super_block super;
+ struct ctree_root *root;
radix_tree_init();
close_ctree(root);
return 0;
}
+#endif
#ifndef __CTREE__
#define __CTREE__
-#define CTREE_BLOCKSIZE 4096
+#define CTREE_BLOCKSIZE 1024
+/*
+ * the key defines the order in the tree, and so it also defines (optimal)
+ * block layout. objectid corresponds to the inode number. The flags
+ * tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with flags of 1 might refer to the inode
+ * data, flags of 2 may point to file data in the btree and flags == 3
+ * may point to extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ */
struct key {
u64 objectid;
u32 flags;
u64 offset;
} __attribute__ ((__packed__));
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
struct header {
u64 fsid[2]; /* FS specific uuid */
- u64 blocknr;
- u64 parentid;
+ u64 blocknr; /* which block this node is supposed to live in */
+ u64 parentid; /* objectid of the tree root */
u32 csum;
u32 ham;
u16 nritems;
u16 flags;
+ /* generation flags to be added */
} __attribute__ ((__packed__));
#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
struct tree_buffer;
+/*
+ * in ram representation of the tree. extent_root is used for all allocations;
+ * for the extent tree itself, extent_root points back at the tree's own root.
+ * current_insert is used only for the extent tree.
+ */
struct ctree_root {
struct tree_buffer *node;
struct ctree_root *extent_root;
struct radix_tree_root cache_radix;
};
+/*
+ * describes a tree on disk
+ */
struct ctree_root_info {
u64 fsid[2]; /* FS specific uuid */
u64 blocknr; /* blocknr of this block */
u64 objectid; /* inode number of this root */
- u64 tree_root; /* the tree root */
+ u64 tree_root; /* the tree root block */
u32 csum;
u32 ham;
u64 snapuuid[2]; /* root specific uuid */
} __attribute__ ((__packed__));
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
struct ctree_super_block {
struct ctree_root_info root_info;
struct ctree_root_info extent_info;
} __attribute__ ((__packed__));
+/*
+ * A leaf is full of items. The exact type of item is defined by
+ * the key flags parameter. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
struct item {
struct key key;
u16 offset;
u16 size;
} __attribute__ ((__packed__));
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
struct leaf {
struct header header;
};
} __attribute__ ((__packed__));
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
struct node {
struct header header;
struct key keys[NODEPTRS_PER_BLOCK];
u64 blockptrs[NODEPTRS_PER_BLOCK];
} __attribute__ ((__packed__));
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
struct extent_item {
u32 refs;
u64 owner;
} __attribute__ ((__packed__));
+/*
+ * ctree_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
struct ctree_path {
struct tree_buffer *nodes[MAX_LEVEL];
int slots[MAX_LEVEL];
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * debug-tree: open the "dbfile" store and print the root tree followed by
+ * the extent ("map") tree.
+ *
+ * Returns 0 after printing both trees, 1 if the store could not be opened.
+ */
+int main() {
+	struct ctree_super_block super;
+	struct ctree_root *root;
+
+	radix_tree_init();
+	root = open_ctree("dbfile", &super);
+	if (!root) {
+		/*
+		 * assumes open_ctree reports failure by returning NULL -- confirm;
+		 * without this check root->node below would be a NULL dereference
+		 */
+		fprintf(stderr, "unable to open dbfile\n");
+		return 1;
+	}
+	printf("root tree\n");
+	print_tree(root, root->node);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
+	/* pair open_ctree with close_ctree, as the tester's main does */
+	close_ctree(root);
+	return 0;
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * pending extents are blocks that we're trying to allocate in the extent
+ * map while trying to grow the map because of other allocations. To avoid
+ * recursing, they are tagged in the radix tree and cleaned up after
+ * other allocations are done. The pending tag is also used in the same
+ * manner for deletes.
+ *
+ * The value is passed as the tag argument of the radix_tree_tag_set,
+ * radix_tree_tag_get, radix_tree_tag_clear and radix_tree_gang_lookup_tag
+ * calls in this file.
+ */
+#define CTREE_EXTENT_PENDING 0
+
+/*
+ * find all the blocks marked as pending in the radix tree and remove
+ * them from the extent map
+ *
+ * Tagged blocks are looked up in batches of ARRAY_SIZE(gang).  For each
+ * block the (blocknr, 0, 1) item is deleted from the extent tree, the
+ * pending tag is cleared and the tree buffer is released.  Batches are
+ * fetched until no tagged block is left.
+ *
+ * Returns 0 on success, or the non-zero result of the search_slot or
+ * del_item call that failed.
+ */
+static int del_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	int nr;
+	int i;
+	struct key key;
+	struct tree_buffer *gang[4];
+	struct ctree_path path;
+
+	while(1) {
+		/*
+		 * the batch size lives in its own variable: ret is reused for
+		 * the per-block calls below and must not change the loop bound
+		 */
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0,
+						ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			init_path(&path);
+			ret = search_slot(extent_root, &key, &path, 0);
+			if (ret) {
+				print_tree(extent_root, extent_root->node);
+				printf("unable to find %lu\n", key.objectid);
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			ret = del_item(extent_root, &path);
+			if (ret) {
+				BUG();
+				return ret;
+			}
+			release_path(extent_root, &path);
+			radix_tree_tag_clear(&extent_root->cache_radix,
+					     gang[i]->blocknr,
+					     CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * remove an extent from the root, returns 0 on success
+ *
+ * When root is the extent tree itself nothing is deleted here: the block is
+ * read, tagged CTREE_EXTENT_PENDING in root's radix tree, and 0 is returned.
+ * Tagged blocks are removed by del_pending_extents(), which this function
+ * calls at the end of a free against any other root.
+ *
+ * For any other root the (blocknr, 0, num_blocks) item is looked up in the
+ * extent tree and deleted.  The return value is the del_item result if it
+ * is non-zero, otherwise the del_pending_extents result.
+ */
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+{
+ struct ctree_path path;
+ struct key key;
+ struct ctree_root *extent_root = root->extent_root;
+ struct tree_buffer *t;
+ int pending_ret;
+ int ret;
+ key.objectid = blocknr;
+ key.flags = 0;
+ key.offset = num_blocks;
+ if (root == extent_root) {
+ t = read_tree_block(root, key.objectid); /* t is not used again here; presumably this holds a reference until del_pending_extents releases the block -- confirm */
+ radix_tree_tag_set(&root->cache_radix, key.objectid,
+ CTREE_EXTENT_PENDING);
+ return 0;
+ }
+ init_path(&path);
+ ret = search_slot(extent_root, &key, &path, 0);
+ if (ret) { /* the extent being freed has no item in the extent tree */
+ print_tree(extent_root, extent_root->node);
+ printf("failed to find %lu\n", key.objectid);
+ BUG();
+ }
+ ret = del_item(extent_root, &path);
+ if (ret)
+ BUG();
+ release_path(extent_root, &path);
+ pending_ret = del_pending_extents(root->extent_root); /* flush deletes deferred by the extent tree */
+ return ret ? ret : pending_ret;
+}
+
+/*
+ * walks the btree of allocated extents and finds a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == block start
+ * ins->flags = 0
+ * ins->offset == number of blocks
+ * Any available blocks before search_start are skipped.
+ *
+ * The search always runs against orig_root->extent_root.  search_end is
+ * accepted but never consulted by this implementation.
+ *
+ * When orig_root is the extent tree itself, the candidate is checked against
+ * extent_root->current_insert and the CTREE_EXTENT_PENDING tag; on a
+ * collision the whole search restarts at the block after the candidate.
+ *
+ * Always returns 0.  BUG()s if the recorded length is not 1 or if the
+ * chosen start lies before search_start.
+ */
+int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+ u64 search_start, u64 search_end, struct key *ins)
+{
+ struct ctree_path path;
+ struct key *key;
+ int ret;
+ u64 hole_size = 0;
+ int slot = 0;
+ u64 last_block; /* only read after start_found has been set, which happens together with its first assignment */
+ int start_found;
+ struct leaf *l;
+ struct ctree_root * root = orig_root->extent_root;
+
+check_failed:
+ init_path(&path);
+ ins->objectid = search_start;
+ ins->offset = 0;
+ ins->flags = 0;
+ start_found = 0;
+ ret = search_slot(root, ins, &path, 0); /* result is not checked; the scan starts from path.slots[0] */
+ while (1) {
+ l = &path.nodes[0]->leaf;
+ slot = path.slots[0];
+ if (slot >= l->header.nritems) { /* ran past the last item of this leaf */
+ ret = next_leaf(root, &path);
+ if (ret == 0) /* presumably 0 means another leaf was loaded into path -- confirm */
+ continue;
+ if (!start_found) { /* no extent at or after search_start was seen */
+ ins->objectid = search_start;
+ ins->offset = num_blocks;
+ start_found = 1;
+ goto check_pending;
+ }
+ ins->objectid = last_block > search_start ?
+ last_block : search_start;
+ ins->offset = num_blocks;
+ goto check_pending;
+ }
+ key = &l->items[slot].key;
+ if (key->objectid >= search_start) {
+ if (start_found) {
+ hole_size = key->objectid - last_block; /* gap between the previous extent's end and this extent's start */
+ if (hole_size > num_blocks) { /* NOTE(review): strict '>' skips a hole of exactly num_blocks -- confirm intended */
+ ins->objectid = last_block;
+ ins->offset = num_blocks;
+ goto check_pending;
+ }
+ } else
+ start_found = 1;
+ last_block = key->objectid + key->offset; /* first block past this extent */
+ }
+ path.slots[0]++;
+ }
+ // FIXME -ENOSPC
+check_pending:
+ /* we have to make sure we didn't find an extent that has already
+ * been allocated by the map tree or the original allocation
+ */
+ release_path(root, &path);
+ BUG_ON(ins->objectid < search_start);
+ if (orig_root->extent_root == orig_root) {
+ BUG_ON(num_blocks != 1);
+ if ((root->current_insert.objectid <= ins->objectid &&
+ root->current_insert.objectid +
+ root->current_insert.offset > ins->objectid) ||
+ (root->current_insert.objectid > ins->objectid &&
+ root->current_insert.objectid <= ins->objectid +
+ ins->offset) ||
+ radix_tree_tag_get(&root->cache_radix, ins->objectid,
+ CTREE_EXTENT_PENDING)) {
+ search_start = ins->objectid + 1; /* retry from the next block */
+ goto check_failed;
+ }
+ }
+ if (ins->offset != 1)
+ BUG();
+ return 0;
+}
+
+/*
+ * insert all of the pending extents reserved during the original
+ * allocation. (CTREE_EXTENT_PENDING).  Returns zero if it all worked out
+ *
+ * Tagged blocks are looked up in batches of ARRAY_SIZE(gang).  For each
+ * block a (blocknr, 0, 1) item with refs == 1 and the extent root's
+ * parentid as owner is inserted, the pending tag is cleared and the tree
+ * buffer is released.  Batches are fetched until no tagged block is left.
+ *
+ * On failure the non-zero insert_item result is returned.
+ */
+static int insert_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	int nr;
+	int i;
+	struct key key;
+	struct extent_item item;
+	struct tree_buffer *gang[4];
+
+	// FIXME -ENOSPC
+	item.refs = 1;
+	item.owner = extent_root->node->node.header.parentid;
+	while(1) {
+		/*
+		 * the batch size lives in its own variable: ret is reused for
+		 * insert_item below and must not change the loop bound
+		 */
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0,
+						ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			ret = insert_item(extent_root, &key, &item,
+					  sizeof(item));
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			radix_tree_tag_clear(&extent_root->cache_radix,
+					     gang[i]->blocknr,
+					     CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * locate a free extent and record it as allocated.
+ *
+ * The extent's key is handed back through ins and the tree buffer of its
+ * first block through buf.
+ *
+ * For an ordinary tree the extent item is inserted into the extent tree
+ * right away (current_insert is set to the new key for the duration of
+ * that insert), then any pending extent-tree blocks are inserted as well.
+ * For the extent tree itself the block is only tagged
+ * CTREE_EXTENT_PENDING and its buffer count is bumped, so that the
+ * allocation does not recurse.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+		 u64 search_end, u64 owner, struct key *ins,
+		 struct tree_buffer **buf)
+{
+	struct ctree_root *extent_root = root->extent_root;
+	struct extent_item new_item;
+	int insert_ret;
+	int pending_ret;
+	int err;
+
+	new_item.refs = 1;
+	new_item.owner = owner;
+
+	err = find_free_extent(root, num_blocks, search_start, search_end, ins);
+	if (err)
+		return err;
+
+	if (root == extent_root) {
+		/* we're allocating an extent for the extent tree, don't recurse */
+		BUG_ON(ins->offset != 1);
+		*buf = find_tree_block(root, ins->objectid);
+		BUG_ON(!*buf);
+		radix_tree_tag_set(&root->cache_radix, ins->objectid,
+				   CTREE_EXTENT_PENDING);
+		(*buf)->count++;
+		return 0;
+	}
+
+	/* publish the key being inserted while the extent tree is modified */
+	memcpy(&extent_root->current_insert, ins, sizeof(*ins));
+	insert_ret = insert_item(extent_root, ins, &new_item,
+				 sizeof(new_item));
+	memset(&extent_root->current_insert, 0, sizeof(struct key));
+
+	/* pending blocks are flushed even when the insert above failed */
+	pending_ret = insert_pending_extents(extent_root);
+	if (insert_ret)
+		return insert_ret;
+	if (pending_ret)
+		return pending_ret;
+
+	*buf = find_tree_block(root, ins->objectid);
+	return 0;
+}
+
+/*
+ * helper function to allocate a block for a given tree
+ * returns the tree buffer or NULL.
+ *
+ * A single block is requested from alloc_extent with the whole block range
+ * as search window and the root node's parentid as owner.  For trees other
+ * than the extent tree the returned block must not still carry the
+ * CTREE_EXTENT_PENDING tag.
+ */
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+	struct key ins;
+	int ret;
+	struct tree_buffer *buf = NULL;
+
+	/*
+	 * search_end is a u64: use a u64 all-ones value so the bound is not
+	 * truncated where unsigned long is 32 bits wide
+	 */
+	ret = alloc_extent(root, 1, 0, (u64)-1,
+			   root->node->node.header.parentid,
+			   &ins, &buf);
+
+	if (ret) {
+		BUG();
+		return NULL;
+	}
+	if (root != root->extent_root)
+		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix,
+					  buf->blocknr, CTREE_EXTENT_PENDING));
+	return buf;
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/* cleared by the signal handler; volatile sig_atomic_t is the type C guarantees safe to write there */
+volatile sig_atomic_t keep_running = 1;
+
+/*
+ * pick an objectid for the next test operation and store it in key
+ * (flags and offset are always set to 0).
+ *
+ * exists != 0: use the first radix tree entry at or after a random index;
+ * returns -1 when the lookup finds nothing.
+ * exists == 0: start from a random index and step past entries that the
+ * lookup reports at that index, retrying the lookup when the two following
+ * indexes are both taken.
+ *
+ * returns 0 once key->objectid has been filled in.
+ */
+static int setup_key(struct radix_tree_root *root, struct key *key, int exists)
+{
+	int candidate = rand();
+	unsigned long found[2];
+	int nr;
+
+	key->flags = 0;
+	key->offset = 0;
+	for (;;) {
+		nr = radix_tree_gang_lookup(root, (void **)found, candidate, 2);
+		if (exists) {
+			if (nr == 0)
+				return -1;
+			candidate = found[0];
+			break;
+		}
+		/* nothing stored at candidate itself: it is free */
+		if (nr == 0 || candidate != found[0])
+			break;
+		candidate++;
+		/* the neighbour is free unless the second hit sits on it */
+		if (nr <= 1 || candidate != found[1])
+			break;
+		candidate++;
+	}
+	key->objectid = candidate;
+	return 0;
+}
+
+/*
+ * insert one unused key into the btree, with "str-<objectid>\n" as its
+ * data, and record the objectid in the radix tree.
+ *
+ * returns 0 on success, -1 (after printing a message) if either insert
+ * fails.
+ */
+static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key new_key;
+	char payload[128];
+	int err;
+
+	init_path(&path);
+	err = setup_key(radix, &new_key, 0);
+	sprintf(payload, "str-%lu\n", new_key.objectid);
+	err = insert_item(root, &new_key, payload, strlen(payload));
+	if (!err) {
+		radix_tree_preload(GFP_KERNEL);
+		err = radix_tree_insert(radix, new_key.objectid,
+					(void *)new_key.objectid);
+		radix_tree_preload_end();
+	}
+	if (err) {
+		printf("failed to insert %lu\n", new_key.objectid);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * try to insert a key that the radix tree already holds; the btree insert
+ * is expected to fail with -EEXIST.
+ *
+ * returns 0 when the insert was rejected with -EEXIST or when no existing
+ * key could be picked, 1 (after printing a message) for any other result.
+ */
+static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key dup_key;
+	char payload[128];
+	int err;
+
+	init_path(&path);
+	if (setup_key(radix, &dup_key, 1) < 0)
+		return 0;
+	sprintf(payload, "str-%lu\n", dup_key.objectid);
+	err = insert_item(root, &dup_key, payload, strlen(payload));
+	if (err == -EEXIST)
+		return 0;
+	printf("insert on %lu gave us %d\n", dup_key.objectid, err);
+	return 1;
+}
+
+/*
+ * delete one key that the radix tree holds from both the btree and the
+ * radix tree.
+ *
+ * returns 0 on success or when no existing key could be picked, -1 (after
+ * printing a message) if the key cannot be found or deleted in either tree.
+ */
+static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key key;
+	int ret;
+	unsigned long *ptr;
+
+	init_path(&path);
+	ret = setup_key(radix, &key, 1);
+	if (ret < 0)
+		return 0;
+	ret = search_slot(root, &key, &path, -1);
+	if (ret) {
+		/*
+		 * release the path on this exit too, the way lookup_item and
+		 * lookup_enoent release it after every search
+		 */
+		release_path(root, &path);
+		goto error;
+	}
+	ret = del_item(root, &path);
+	release_path(root, &path);
+	if (ret != 0)
+		goto error;
+	ptr = radix_tree_delete(radix, key.objectid);
+	if (!ptr)
+		goto error;
+	return 0;
+error:
+	printf("failed to delete %lu\n", key.objectid);
+	return -1;
+}
+
+/*
+ * search the btree for a key that the radix tree holds.
+ *
+ * returns 0 when the search succeeds or when no existing key could be
+ * picked, -1 (after printing a message) when search_slot reports non-zero.
+ */
+static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key wanted;
+	int err;
+
+	init_path(&path);
+	if (setup_key(radix, &wanted, 1) < 0)
+		return 0;
+	err = search_slot(root, &wanted, &path, 0);
+	release_path(root, &path);
+	if (!err)
+		return 0;
+	printf("unable to find key %lu\n", wanted.objectid);
+	return -1;
+}
+
+/*
+ * search the btree for a key chosen to be absent from the radix tree; the
+ * search is expected to report non-zero.
+ *
+ * returns 0 when the key is not found, the setup_key result if that is
+ * negative, and -1 (after printing a message) when search_slot returns 0.
+ */
+static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key missing;
+	int err;
+
+	init_path(&path);
+	err = setup_key(radix, &missing, 0);
+	if (err < 0)
+		return err;
+	err = search_slot(root, &missing, &path, 0);
+	release_path(root, &path);
+	if (err != 0)
+		return 0;
+	printf("able to find key that should not exist %lu\n", missing.objectid);
+	return -1;
+}
+
+int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
+{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent }; /* test operations; main() picks one per round with rand() % ARRAY_SIZE(ops) */
+
+/*
+ * mirror the objectids already stored in the btree into the radix tree.
+ *
+ * The scan starts at the largest objectid and works downwards: each search
+ * lands in a leaf, every item from the located slot down to slot 0 is
+ * inserted into the radix tree, and the next search starts just below the
+ * smallest objectid seen.  It ends when a search lands on slot 0 without a
+ * match or when the next start value would wrap below zero.
+ *
+ * returns 0; exits the process with status 1 if a radix insert fails.
+ */
+static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	struct ctree_path path;
+	struct key cursor;
+	u64 objectid;
+	int err;
+	int idx;
+
+	cursor.offset = 0;
+	cursor.flags = 0;
+	cursor.objectid = (unsigned long)-1;
+	for (;;) {
+		init_path(&path);
+		err = search_slot(root, &cursor, &path, 0);
+		idx = path.slots[0];
+		if (err != 0) {
+			if (idx == 0) {
+				release_path(root, &path);
+				break;
+			}
+			/* no exact match: step back to the preceding item */
+			idx--;
+		}
+		while (idx >= 0) {
+			objectid = path.nodes[0]->leaf.items[idx].key.objectid;
+			radix_tree_preload(GFP_KERNEL);
+			err = radix_tree_insert(radix, objectid,
+						(void *)objectid);
+			if (err) {
+				fprintf(stderr,
+					"failed to insert %lu into radix\n",
+					objectid);
+				exit(1);
+			}
+
+			radix_tree_preload_end();
+			idx--;
+		}
+		release_path(root, &path);
+		cursor.objectid = objectid - 1;
+		/* objectid was 0: the subtraction wrapped, nothing is lower */
+		if (cursor.objectid > objectid)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * SIGTERM/SIGINT handler: clear keep_running so the loops in main() stop,
+ * and print a notice on stderr.
+ */
+void sigstopper(int ignored)
+{
+	static const char msg[] = "caught exit signal, stopping\n";
+
+	keep_running = 0;
+	/*
+	 * stdio is not async-signal-safe, so the notice goes out through
+	 * write(2); nothing useful can be done here if the write fails
+	 */
+	if (write(STDERR_FILENO, msg, sizeof(msg) - 1) < 0)
+		return;
+}
+
+/*
+ * print the command line help on stdout and terminate the process with
+ * exit status 1; never returns to the caller.
+ */
+int print_usage(void)
+{
+	static const char *const help[] = {
+		"usage: tester [-ih] [-c count] [-f count]\n",
+		"\t -c count -- iteration count after filling\n",
+		"\t -f count -- run this many random inserts before starting\n",
+		"\t -i -- only do initial fill\n",
+		"\t -h -- this help text\n",
+	};
+	unsigned int line;
+
+	for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
+		printf("%s", help[line]);
+	exit(1);
+}
+/*
+ * tester entry point.
+ *
+ * Opens "dbfile" and mirrors its existing keys into an in-memory radix
+ * tree, then parses the command line.  It runs init_fill_count random
+ * inserts (default 800000, -f) and, unless -i was given, iterations rounds
+ * (default 20000, -c).  Each round picks one entry of ops[] and repeats it
+ * rand() % 128 times.  Every 5000 rounds the super block is written and the
+ * tree is closed and reopened.  SIGINT/SIGTERM end the run early through
+ * keep_running.
+ *
+ * Returns 0 on success or after a signal-requested stop, otherwise the
+ * result of the operation that failed.  Unknown options, and -c/-f without
+ * a value, print the usage text and exit with status 1.
+ */
+int main(int ac, char **av)
+{
+	RADIX_TREE(radix, GFP_KERNEL);
+	struct ctree_super_block super;
+	struct ctree_root *root;
+	int i;
+	int ret;
+	int count;
+	int op;
+	int iterations = 20000;
+	int init_fill_count = 800000;
+	int err = 0;
+	int initial_only = 0;
+	radix_tree_init();
+	/* NOTE(review): the open_ctree result is used unchecked here and below */
+	root = open_ctree("dbfile", &super);
+	fill_radix(root, &radix);
+
+	signal(SIGTERM, sigstopper);
+	signal(SIGINT, sigstopper);
+
+	for (i = 1 ; i < ac ; i++) {
+		if (strcmp(av[i], "-i") == 0) {
+			initial_only = 1;
+		} else if (strcmp(av[i], "-c") == 0) {
+			/* av[ac] is NULL: refuse a trailing -c without a count */
+			if (i + 1 >= ac)
+				print_usage();
+			iterations = atoi(av[i+1]);
+			i++;
+		} else if (strcmp(av[i], "-f") == 0) {
+			/* same guard for a trailing -f */
+			if (i + 1 >= ac)
+				print_usage();
+			init_fill_count = atoi(av[i+1]);
+			i++;
+		} else {
+			/* -h and anything unrecognised end up here */
+			print_usage();
+		}
+	}
+	for (i = 0; i < init_fill_count; i++) {
+		ret = ins_one(root, &radix);
+		if (ret) {
+			printf("initial fill failed\n");
+			err = ret;
+			goto out;
+		}
+		if (i % 10000 == 0) {
+			printf("initial fill %d level %d count %d\n", i,
+				node_level(root->node->node.header.flags),
+				root->node->node.header.nritems);
+		}
+		if (keep_running == 0) {
+			err = 0;
+			goto out;
+		}
+	}
+	if (initial_only == 1) {
+		goto out;
+	}
+	for (i = 0; i < iterations; i++) {
+		op = rand() % ARRAY_SIZE(ops);
+		count = rand() % 128;
+		if (i % 2000 == 0) {
+			printf("%d\n", i);
+			fflush(stdout);
+		}
+		if (i && i % 5000 == 0) {
+			printf("open & close, root level %d nritems %d\n",
+				node_level(root->node->node.header.flags),
+				root->node->node.header.nritems);
+			write_ctree_super(root, &super);
+			close_ctree(root);
+			root = open_ctree("dbfile", &super);
+		}
+		while(count--) {
+			ret = ops[op](root, &radix);
+			if (ret) {
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				print_tree(root, root->node);
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				err = ret;
+				goto out;
+			}
+			if (keep_running == 0) {
+				err = 0;
+				goto out;
+			}
+		}
+	}
+out:
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	return err;
+}
+