Btrfs: Add backing store, memory management
authorChris Mason <chris.mason@oracle.com>
Fri, 2 Feb 2007 14:18:22 +0000 (09:18 -0500)
committerDavid Woodhouse <dwmw2@hera.kernel.org>
Fri, 2 Feb 2007 14:18:22 +0000 (09:18 -0500)
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/Makefile
fs/btrfs/ctree.c
fs/btrfs/ctree.h [new file with mode: 0644]
fs/btrfs/disk-io.c [new file with mode: 0644]
fs/btrfs/disk-io.h [new file with mode: 0644]
fs/btrfs/kerncompat.h
fs/btrfs/radix-tree.c [new file with mode: 0644]
fs/btrfs/radix-tree.h [new file with mode: 0644]

index 9f84c08baab2c445cf3b8009ea6635e12507f9af..63360212a075c4ed22f71bfc752d7cfe5802ba61 100644 (file)
@@ -1,7 +1,12 @@
 
-ctree: ctree.o
-       gcc -g -O2 -Wall -o ctree ctree.c
+CFLAGS= -g -Wall
+
+.c.o:
+       $(CC) $(CFLAGS) -c $<
+
+ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h
+       gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o
 
 clean:
-       rm ctree ctree.o
+       rm ctree *.o
 
index 4bf5e92584bd309b2abda1323ca62fa616bcd7f3..6f0522f2108282681ce156ae415e4e336e959f0d 100644 (file)
@@ -1,68 +1,25 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "kerncompat.h"
-
-#define BLOCKSIZE 4096
-
-struct key {
-       u64 objectid;
-       u32 flags;
-       u64 offset;
-} __attribute__ ((__packed__));
-
-struct header {
-       u64 fsid[2]; /* FS specific uuid */
-       u64 blocknum;
-       u64 parentid;
-       u32 csum;
-       u32 ham;
-       u16 nritems;
-       u16 flags;
-} __attribute__ ((__packed__));
-
-#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \
-                           (sizeof(struct key) + sizeof(u64)))
-
-#define LEVEL_BITS 3
-#define MAX_LEVEL (1 << LEVEL_BITS)
-#define node_level(f) ((f) & (MAX_LEVEL-1))
-#define is_leaf(f) (node_level(f) == 0)
-
-struct ctree_root {
-       struct node *node;
-};
-
-struct item {
-       struct key key;
-       u16 offset;
-       u16 size;
-} __attribute__ ((__packed__));
-
-#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header))
-struct leaf {
-       struct header header;
-       union {
-               struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
-               u8 data[BLOCKSIZE-sizeof(struct header)];
-       };
-} __attribute__ ((__packed__));
-
-struct node {
-       struct header header;
-       struct key keys[NODEPTRS_PER_BLOCK];
-       u64 blockptrs[NODEPTRS_PER_BLOCK];
-} __attribute__ ((__packed__));
-
-struct ctree_path {
-       struct node *nodes[MAX_LEVEL];
-       int slots[MAX_LEVEL];
-};
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
 
 static inline void init_path(struct ctree_path *p)
 {
        memset(p, 0, sizeof(*p));
 }
 
+static void release_path(struct ctree_root *root, struct ctree_path *p)
+{
+       int i;
+       for (i = 0; i < MAX_LEVEL; i++) {
+               if (!p->nodes[i])
+                       break;
+               tree_block_release(root, p->nodes[i]);
+       }
+}
+
 static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
        unsigned int nr = leaf->header.nritems;
@@ -135,26 +92,25 @@ int bin_search(struct node *c, struct key *key, int *slot)
        return -1;
 }
 
-void *read_block(u64 blocknum)
-{
-       return (void *)blocknum;
-}
-
 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 {
-       struct node *c = root->node;
+       struct tree_buffer *b = root->node;
+       struct node *c;
+
        int slot;
        int ret;
        int level;
-       while (c) {
+       b->count++;
+       while (b) {
+               c = &b->node;
                level = node_level(c->header.flags);
-               p->nodes[level] = c;
+               p->nodes[level] = b;
                ret = bin_search(c, key, &slot);
                if (!is_leaf(c->header.flags)) {
                        if (ret && slot > 0)
                                slot -= 1;
                        p->slots[level] = slot;
-                       c = read_block(c->blockptrs[slot]);
+                       b = read_tree_block(root, c->blockptrs[slot]);
                        continue;
                } else {
                        p->slots[level] = slot;
@@ -164,17 +120,20 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
        return -1;
 }
 
-static void fixup_low_keys(struct ctree_path *path, struct key *key,
-                            int level)
+static void fixup_low_keys(struct ctree_root *root,
+                          struct ctree_path *path, struct key *key,
+                          int level)
 {
        int i;
        /* adjust the pointers going up the tree */
        for (i = level; i < MAX_LEVEL; i++) {
-               struct node *t = path->nodes[i];
+               struct node *t;
                int tslot = path->slots[i];
-               if (!t)
+               if (!path->nodes[i])
                        break;
+               t = &path->nodes[i]->node;
                memcpy(t->keys + tslot, key, sizeof(*key));
+               write_tree_block(root, path->nodes[i]);
                if (tslot != 0)
                        break;
        }
@@ -190,27 +149,34 @@ int __insert_ptr(struct ctree_root *root,
        int nritems;
        /* need a new root */
        if (!path->nodes[level]) {
-               c = malloc(sizeof(struct node));
+               struct tree_buffer *t;
+               t = alloc_free_block(root);
+               c = &t->node;
                memset(c, 0, sizeof(c));
                c->header.nritems = 2;
                c->header.flags = node_level(level);
-               lower = path->nodes[level-1];
+               c->header.blocknr = t->blocknr;
+               lower = &path->nodes[level-1]->node;
                if (is_leaf(lower->header.flags))
                        lower_key = &((struct leaf *)lower)->items[0].key;
                else
                        lower_key = lower->keys;
                memcpy(c->keys, lower_key, sizeof(struct key));
                memcpy(c->keys + 1, key, sizeof(struct key));
-               c->blockptrs[0] = (u64)lower;
+               c->blockptrs[0] = path->nodes[level-1]->blocknr;
                c->blockptrs[1] = blocknr;
-               root->node = c;
-               path->nodes[level] = c;
+               /* the path has an extra ref to root->node */
+               tree_block_release(root, root->node);
+               root->node = t;
+               t->count++;
+               write_tree_block(root, t);
+               path->nodes[level] = t;
                path->slots[level] = 0;
                if (c->keys[1].objectid == 0)
                        BUG();
                return 0;
        }
-       lower = path->nodes[level];
+       lower = &path->nodes[level]->node;
        nritems = lower->header.nritems;
        if (slot > nritems)
                BUG();
@@ -227,6 +193,7 @@ int __insert_ptr(struct ctree_root *root,
        lower->header.nritems++;
        if (lower->keys[1].objectid == 0)
                        BUG();
+       write_tree_block(root, path->nodes[level]);
        return 0;
 }
 
@@ -238,6 +205,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
        int push_items = 0;
        int left_nritems;
        int right_nritems;
+       struct tree_buffer *t;
+       struct tree_buffer *right_buf;
 
        if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
                return 1;
@@ -245,13 +214,18 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
        if (slot == 0)
                return 1;
 
-       left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]);
-       right = path->nodes[level];
+       t = read_tree_block(root,
+                           path->nodes[level + 1]->node.blockptrs[slot - 1]);
+       left = &t->node;
+       right_buf = path->nodes[level];
+       right = &right_buf->node;
        left_nritems = left->header.nritems;
        right_nritems = right->header.nritems;
        push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
-       if (push_items <= 0)
+       if (push_items <= 0) {
+               tree_block_release(root, t);
                return 1;
+       }
 
        if (right_nritems < push_items)
                push_items = right_nritems;
@@ -267,15 +241,20 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
        left->header.nritems += push_items;
 
        /* adjust the pointers going up the tree */
-       fixup_low_keys(path, right->keys, level + 1);
+       fixup_low_keys(root, path, right->keys, level + 1);
+
+       write_tree_block(root, t);
+       write_tree_block(root, right_buf);
 
        /* then fixup the leaf pointer in the path */
        if (path->slots[level] < push_items) {
                path->slots[level] += left_nritems;
-               path->nodes[level] = (struct node*)left;
+               tree_block_release(root, path->nodes[level]);
+               path->nodes[level] = t;
                path->slots[level + 1] -= 1;
        } else {
                path->slots[level] -= push_items;
+               tree_block_release(root, t);
        }
        return 0;
 }
@@ -283,6 +262,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 {
        int slot;
+       struct tree_buffer *t;
+       struct tree_buffer *src_buffer;
        struct node *dst;
        struct node *src;
        int push_items = 0;
@@ -295,16 +276,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
        if (slot == NODEPTRS_PER_BLOCK - 1)
                return 1;
 
-       if (slot >= path->nodes[level + 1]->header.nritems -1)
+       if (slot >= path->nodes[level + 1]->node.header.nritems -1)
                return 1;
 
-       dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]);
-       src = path->nodes[level];
+       t = read_tree_block(root,
+                           path->nodes[level + 1]->node.blockptrs[slot + 1]);
+       dst = &t->node;
+       src_buffer = path->nodes[level];
+       src = &src_buffer->node;
        dst_nritems = dst->header.nritems;
        src_nritems = src->header.nritems;
        push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
-       if (push_items <= 0)
+       if (push_items <= 0) {
+               tree_block_release(root, t);
                return 1;
+       }
 
        if (src_nritems < push_items)
                push_items = src_nritems;
@@ -322,13 +308,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
        dst->header.nritems += push_items;
 
        /* adjust the pointers going up the tree */
-       memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1,
+       memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
                dst->keys, sizeof(struct key));
+
+       write_tree_block(root, path->nodes[level + 1]);
+       write_tree_block(root, t);
+       write_tree_block(root, src_buffer);
+
        /* then fixup the leaf pointer in the path */
        if (path->slots[level] >= src->header.nritems) {
                path->slots[level] -= src->header.nritems;
-               path->nodes[level] = (struct node*)dst;
+               tree_block_release(root, path->nodes[level]);
+               path->nodes[level] = t;
                path->slots[level + 1] += 1;
+       } else {
+               tree_block_release(root, t);
        }
        return 0;
 }
@@ -337,15 +331,18 @@ int insert_ptr(struct ctree_root *root,
                struct ctree_path *path, struct key *key,
                u64 blocknr, int level)
 {
-       struct node *c = path->nodes[level];
+       struct tree_buffer *t = path->nodes[level];
+       struct node *c = &path->nodes[level]->node;
        struct node *b;
-       struct node *bal[MAX_LEVEL];
+       struct tree_buffer *b_buffer;
+       struct tree_buffer *bal[MAX_LEVEL];
        int bal_level = level;
        int mid;
        int bal_start = -1;
 
        memset(bal, 0, ARRAY_SIZE(bal));
-       while(c && c->header.nritems == NODEPTRS_PER_BLOCK) {
+       while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
+               c = &t->node;
                if (push_node_left(root, path,
                   node_level(c->header.flags)) == 0)
                        break;
@@ -355,8 +352,10 @@ int insert_ptr(struct ctree_root *root,
                bal_start = bal_level;
                if (bal_level == MAX_LEVEL - 1)
                        BUG();
-               b = malloc(sizeof(struct node));
+               b_buffer = alloc_free_block(root);
+               b = &b_buffer->node;
                b->header.flags = c->header.flags;
+               b->header.blocknr = b_buffer->blocknr;
                mid = (c->header.nritems + 1) / 2;
                memcpy(b->keys, c->keys + mid,
                        (c->header.nritems - mid) * sizeof(struct key));
@@ -364,21 +363,28 @@ int insert_ptr(struct ctree_root *root,
                        (c->header.nritems - mid) * sizeof(u64));
                b->header.nritems = c->header.nritems - mid;
                c->header.nritems = mid;
-               bal[bal_level] = b;
+
+               write_tree_block(root, t);
+               write_tree_block(root, b_buffer);
+
+               bal[bal_level] = b_buffer;
                if (bal_level == MAX_LEVEL - 1)
                        break;
                bal_level += 1;
-               c = path->nodes[bal_level];
+               t = path->nodes[bal_level];
        }
        while(bal_start > 0) {
-               b = bal[bal_start];
-               c = path->nodes[bal_start];
-               __insert_ptr(root, path, b->keys, (u64)b,
+               b_buffer = bal[bal_start];
+               c = &path->nodes[bal_start]->node;
+               __insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
                                path->slots[bal_start + 1] + 1, bal_start + 1);
                if (path->slots[bal_start] >= c->header.nritems) {
                        path->slots[bal_start] -= c->header.nritems;
-                       path->nodes[bal_start] = b;
+                       tree_block_release(root, path->nodes[bal_start]);
+                       path->nodes[bal_start] = b_buffer;
                        path->slots[bal_start + 1] += 1;
+               } else {
+                       tree_block_release(root, b_buffer);
                }
                bal_start--;
                if (!bal[bal_start])
@@ -404,7 +410,9 @@ int leaf_space_used(struct leaf *l, int start, int nr)
 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
                   int data_size)
 {
-       struct leaf *right = (struct leaf *)path->nodes[0];
+       struct tree_buffer *right_buf = path->nodes[0];
+       struct leaf *right = &right_buf->leaf;
+       struct tree_buffer *t;
        struct leaf *left;
        int slot;
        int i;
@@ -421,9 +429,11 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
        if (!path->nodes[1]) {
                return 1;
        }
-       left = read_block(path->nodes[1]->blockptrs[slot - 1]);
+       t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
+       left = &t->leaf;
        free_space = leaf_free_space(left);
        if (free_space < data_size + sizeof(struct item)) {
+               tree_block_release(root, t);
                return 1;
        }
        for (i = 0; i < right->header.nritems; i++) {
@@ -436,6 +446,7 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
                push_space += item->size + sizeof(*item);
        }
        if (push_items == 0) {
+               tree_block_release(root, t);
                return 1;
        }
        /* push data from right to left */
@@ -446,6 +457,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
                right->data + right->items[push_items - 1].offset,
                push_space);
        old_left_nritems = left->header.nritems;
+       BUG_ON(old_left_nritems < 0);
+
        for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                left->items[i].offset -= LEAF_DATA_SIZE -
                        left->items[old_left_nritems -1].offset;
@@ -460,30 +473,40 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
                (right->header.nritems - push_items) * sizeof(struct item));
        right->header.nritems -= push_items;
        push_space = LEAF_DATA_SIZE;
+
        for (i = 0; i < right->header.nritems; i++) {
                right->items[i].offset = push_space - right->items[i].size;
                push_space = right->items[i].offset;
        }
-       fixup_low_keys(path, &right->items[0].key, 1);
+
+       write_tree_block(root, t);
+       write_tree_block(root, right_buf);
+
+       fixup_low_keys(root, path, &right->items[0].key, 1);
 
        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                path->slots[0] += old_left_nritems;
-               path->nodes[0] = (struct node*)left;
+               tree_block_release(root, path->nodes[0]);
+               path->nodes[0] = t;
                path->slots[1] -= 1;
        } else {
+               tree_block_release(root, t);
                path->slots[0] -= push_items;
        }
+       BUG_ON(path->slots[0] < 0);
        return 0;
 }
 
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 {
-       struct leaf *l = (struct leaf *)path->nodes[0];
-       int nritems = l->header.nritems;
-       int mid = (nritems + 1)/ 2;
-       int slot = path->slots[0];
+       struct tree_buffer *l_buf = path->nodes[0];
+       struct leaf *l = &l_buf->leaf;
+       int nritems;
+       int mid;
+       int slot;
        struct leaf *right;
+       struct tree_buffer *right_buffer;
        int space_needed = data_size + sizeof(struct item);
        int data_copy_size;
        int rt_data_off;
@@ -491,9 +514,19 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
        int ret;
 
        if (push_leaf_left(root, path, data_size) == 0) {
-               return 0;
+               l_buf = path->nodes[0];
+               l = &l_buf->leaf;
+               if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+                       return 0;
        }
-       right = malloc(sizeof(struct leaf));
+       slot = path->slots[0];
+       nritems = l->header.nritems;
+       mid = (nritems + 1)/ 2;
+
+       right_buffer = alloc_free_block(root);
+       BUG_ON(!right_buffer);
+       BUG_ON(mid == nritems);
+       right = &right_buffer->leaf;
        memset(right, 0, sizeof(*right));
        if (mid <= slot) {
                if (leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -505,6 +538,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
                        BUG();
        }
        right->header.nritems = nritems - mid;
+       right->header.blocknr = right_buffer->blocknr;
+       right->header.flags = node_level(0);
        data_copy_size = l->items[mid].offset + l->items[mid].size -
                         leaf_data_end(l);
        memcpy(right->items, l->items + mid,
@@ -518,12 +553,20 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
        }
        l->header.nritems = mid;
        ret = insert_ptr(root, path, &right->items[0].key,
-                         (u64)right, 1);
+                         right_buffer->blocknr, 1);
+
+       write_tree_block(root, right_buffer);
+       write_tree_block(root, l_buf);
+
+       BUG_ON(path->slots[0] != slot);
        if (mid <= slot) {
-               path->nodes[0] = (struct node *)right;
+               tree_block_release(root, path->nodes[0]);
+               path->nodes[0] = right_buffer;
                path->slots[0] -= mid;
                path->slots[1] += 1;
-       }
+       } else
+               tree_block_release(root, right_buffer);
+       BUG_ON(path->slots[0] < 0);
        return ret;
 }
 
@@ -532,28 +575,48 @@ int insert_item(struct ctree_root *root, struct key *key,
 {
        int ret;
        int slot;
+       int slot_orig;
        struct leaf *leaf;
+       struct tree_buffer *leaf_buf;
        unsigned int nritems;
        unsigned int data_end;
        struct ctree_path path;
 
+       if (!root->node) {
+               struct tree_buffer *t;
+               t = alloc_free_block(root);
+               BUG_ON(!t);
+               t->node.header.nritems = 0;
+               t->node.header.flags = node_level(0);
+               t->node.header.blocknr = t->blocknr;
+               root->node = t;
+               write_tree_block(root, t);
+       }
        init_path(&path);
        ret = search_slot(root, key, &path);
-       if (ret == 0)
+       if (ret == 0) {
+               release_path(root, &path);
                return -EEXIST;
+       }
 
-       leaf = (struct leaf *)path.nodes[0];
-       if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
+       slot_orig = path.slots[0];
+       leaf_buf = path.nodes[0];
+       leaf = &leaf_buf->leaf;
+       if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
                split_leaf(root, &path, data_size);
-       leaf = (struct leaf *)path.nodes[0];
+               leaf_buf = path.nodes[0];
+               leaf = &path.nodes[0]->leaf;
+       }
        nritems = leaf->header.nritems;
        data_end = leaf_data_end(leaf);
+
        if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
                BUG();
 
        slot = path.slots[0];
+       BUG_ON(slot < 0);
        if (slot == 0)
-               fixup_low_keys(&path, key, 1);
+               fixup_low_keys(root, &path, key, 1);
        if (slot != nritems) {
                int i;
                unsigned int old_data = leaf->items[slot].offset +
@@ -580,21 +643,25 @@ int insert_item(struct ctree_root *root, struct key *key,
        leaf->items[slot].size = data_size;
        memcpy(leaf->data + data_end - data_size, data, data_size);
        leaf->header.nritems += 1;
+       write_tree_block(root, leaf_buf);
        if (leaf_free_space(leaf) < 0)
                BUG();
+       release_path(root, &path);
        return 0;
 }
 
 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 {
        int slot;
+       struct tree_buffer *t;
        struct node *node;
        int nritems;
 
        while(1) {
-               node = path->nodes[level];
-               if (!node)
+               t = path->nodes[level];
+               if (!t)
                        break;
+               node = &t->node;
                slot = path->slots[level];
                nritems = node->header.nritems;
 
@@ -606,28 +673,34 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
                                sizeof(u64) * (nritems - slot - 1));
                }
                node->header.nritems--;
+               write_tree_block(root, t);
                if (node->header.nritems != 0) {
                        int tslot;
                        if (slot == 0)
-                               fixup_low_keys(path, node->keys, level + 1);
+                               fixup_low_keys(root, path, node->keys,
+                                              level + 1);
                        tslot = path->slots[level+1];
+                       t->count++;
                        push_node_left(root, path, level);
                        if (node->header.nritems) {
                                push_node_right(root, path, level);
                        }
-                       if (node->header.nritems)
+                       if (node->header.nritems) {
+                               tree_block_release(root, t);
                                break;
+                       }
+                       tree_block_release(root, t);
                        path->slots[level+1] = tslot;
                }
-               if (node == root->node) {
-                       printf("root is now null!\n");
-                       root->node = NULL;
+               if (t == root->node) {
+                       /* just turn the root into a leaf and break */
+                       root->node->node.header.flags = node_level(0);
+                       write_tree_block(root, t);
                        break;
                }
                level++;
                if (!path->nodes[level])
                        BUG();
-               free(node);
        }
        return 0;
 }
@@ -636,10 +709,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 {
        int slot;
        struct leaf *leaf;
+       struct tree_buffer *leaf_buf;
        int doff;
        int dsize;
 
-       leaf = (struct leaf *)path->nodes[0];
+       leaf_buf = path->nodes[0];
+       leaf = &leaf_buf->leaf;
        slot = path->slots[0];
        doff = leaf->items[slot].offset;
        dsize = leaf->items[slot].size;
@@ -658,14 +733,15 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
        }
        leaf->header.nritems -= 1;
        if (leaf->header.nritems == 0) {
-               if (leaf == (struct leaf *)root->node)
-                       root->node = NULL;
-               else
+               if (leaf_buf == root->node) {
+                       leaf->header.flags = node_level(0);
+                       write_tree_block(root, leaf_buf);
+               } else
                        del_ptr(root, path, 1);
-               free(leaf);
        } else {
                if (slot == 0)
-                       fixup_low_keys(path, &leaf->items[0].key, 1);
+                       fixup_low_keys(root, path, &leaf->items[0].key, 1);
+               write_tree_block(root, leaf_buf);
                if (leaf_space_used(leaf, 0, leaf->header.nritems) <
                    LEAF_DATA_SIZE / 4) {
                        /* push_leaf_left fixes the path.
@@ -673,12 +749,13 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
                         * for possible call to del_ptr below
                         */
                        slot = path->slots[1];
+                       leaf_buf->count++;
                        push_leaf_left(root, path, 1);
                        if (leaf->header.nritems == 0) {
-                               free(leaf);
                                path->slots[1] = slot;
                                del_ptr(root, path, 1);
                        }
+                       tree_block_release(root, leaf_buf);
                }
        }
        return 0;
@@ -689,7 +766,7 @@ void print_leaf(struct leaf *l)
        int i;
        int nr = l->header.nritems;
        struct item *item;
-       printf("leaf %p total ptrs %d free space %d\n", l, nr,
+       printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
               leaf_free_space(l));
        fflush(stdout);
        for (i = 0 ; i < nr ; i++) {
@@ -703,38 +780,45 @@ void print_leaf(struct leaf *l)
                fflush(stdout);
        }
 }
-void print_tree(struct node *c)
+void print_tree(struct ctree_root *root, struct tree_buffer *t)
 {
        int i;
        int nr;
+       struct node *c;
 
-       if (!c)
+       if (!t)
                return;
+       c = &t->node;
        nr = c->header.nritems;
+       if (c->header.blocknr != t->blocknr)
+               BUG();
        if (is_leaf(c->header.flags)) {
                print_leaf((struct leaf *)c);
                return;
        }
-       printf("node %p level %d total ptrs %d free spc %lu\n", c,
+       printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
                node_level(c->header.flags), c->header.nritems,
                NODEPTRS_PER_BLOCK - c->header.nritems);
        fflush(stdout);
        for (i = 0; i < nr; i++) {
-               printf("\tkey %d (%lu %u %lu) block %lx\n",
+               printf("\tkey %d (%lu %u %lu) block %lu\n",
                       i,
                       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
                       c->blockptrs[i]);
                fflush(stdout);
        }
        for (i = 0; i < nr; i++) {
-               struct node *next = read_block(c->blockptrs[i]);
+               struct tree_buffer *next_buf = read_tree_block(root,
+                                                           c->blockptrs[i]);
+               struct node *next = &next_buf->node;
                if (is_leaf(next->header.flags) &&
                    node_level(c->header.flags) != 1)
                        BUG();
                if (node_level(next->header.flags) !=
                        node_level(c->header.flags) - 1)
                        BUG();
-               print_tree(next);
+               print_tree(root, next_buf);
+               tree_block_release(root, next_buf);
        }
 
 }
@@ -746,23 +830,24 @@ int next_key(int i, int max_key) {
 }
 
 int main() {
-       struct leaf *first_node = malloc(sizeof(struct leaf));
-       struct ctree_root root;
+       struct ctree_root *root;
        struct key ins;
        struct key last = { (u64)-1, 0, 0};
        char *buf;
        int i;
        int num;
        int ret;
-       int run_size = 100000;
+       int run_size = 1000000;
        int max_key = 100000000;
        int tree_size = 0;
        struct ctree_path path;
 
+       radix_tree_init();
+
+
+       root = open_ctree("dbfile");
 
        srand(55);
-       root.node = (struct node *)first_node;
-       memset(first_node, 0, sizeof(*first_node));
        for (i = 0; i < run_size; i++) {
                buf = malloc(64);
                num = next_key(i, max_key);
@@ -772,39 +857,46 @@ int main() {
                ins.objectid = num;
                ins.offset = 0;
                ins.flags = 0;
-               ret = insert_item(&root, &ins, buf, strlen(buf));
+               ret = insert_item(root, &ins, buf, strlen(buf));
                if (!ret)
                        tree_size++;
        }
+       close_ctree(root);
+       root = open_ctree("dbfile");
+       printf("starting search\n");
        srand(55);
        for (i = 0; i < run_size; i++) {
                num = next_key(i, max_key);
                ins.objectid = num;
                init_path(&path);
-               ret = search_slot(&root, &ins, &path);
+               ret = search_slot(root, &ins, &path);
                if (ret) {
-                       print_tree(root.node);
+                       print_tree(root, root->node);
                        printf("unable to find %d\n", num);
                        exit(1);
                }
-       }
-       printf("node %p level %d total ptrs %d free spc %lu\n", root.node,
-               node_level(root.node->header.flags), root.node->header.nritems,
-               NODEPTRS_PER_BLOCK - root.node->header.nritems);
-       // print_tree(root.node);
-       printf("all searches good\n");
+               release_path(root, &path);
+       }
+       close_ctree(root);
+       root = open_ctree("dbfile");
+       printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
+               node_level(root->node->node.header.flags),
+               root->node->node.header.nritems,
+               NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+       printf("all searches good, deleting some items\n");
        i = 0;
        srand(55);
        for (i = 0 ; i < run_size/4; i++) {
                num = next_key(i, max_key);
                ins.objectid = num;
                init_path(&path);
-               ret = search_slot(&root, &ins, &path);
+               ret = search_slot(root, &ins, &path);
                if (ret)
                        continue;
-               ret = del_item(&root, &path);
+               ret = del_item(root, &path);
                if (ret != 0)
                        BUG();
+               release_path(root, &path);
                tree_size--;
        }
        srand(128);
@@ -813,38 +905,58 @@ int main() {
                num = next_key(i, max_key);
                sprintf(buf, "string-%d", num);
                ins.objectid = num;
-               ret = insert_item(&root, &ins, buf, strlen(buf));
+               ret = insert_item(root, &ins, buf, strlen(buf));
                if (!ret)
                        tree_size++;
        }
-       while(root.node) {
+       close_ctree(root);
+       root = open_ctree("dbfile");
+       printf("starting search2\n");
+       srand(128);
+       for (i = 0; i < run_size; i++) {
+               num = next_key(i, max_key);
+               ins.objectid = num;
+               init_path(&path);
+               ret = search_slot(root, &ins, &path);
+               if (ret) {
+                       print_tree(root, root->node);
+                       printf("unable to find %d\n", num);
+                       exit(1);
+               }
+               release_path(root, &path);
+       }
+       printf("starting big long delete run\n");
+       while(root->node && root->node->node.header.nritems > 0) {
                struct leaf *leaf;
                int slot;
                ins.objectid = (u64)-1;
                init_path(&path);
-               ret = search_slot(&root, &ins, &path);
+               ret = search_slot(root, &ins, &path);
                if (ret == 0)
                        BUG();
 
-               leaf = (struct leaf *)(path.nodes[0]);
+               leaf = &path.nodes[0]->leaf;
                slot = path.slots[0];
                if (slot != leaf->header.nritems)
                        BUG();
                while(path.slots[0] > 0) {
                        path.slots[0] -= 1;
                        slot = path.slots[0];
-                       leaf = (struct leaf *)(path.nodes[0]);
+                       leaf = &path.nodes[0]->leaf;
 
                        if (comp_keys(&last, &leaf->items[slot].key) <= 0)
                                BUG();
                        memcpy(&last, &leaf->items[slot].key, sizeof(last));
-                       ret = del_item(&root, &path);
-                       if (ret != 0)
+                       ret = del_item(root, &path);
+                       if (ret != 0) {
+                               printf("del_item returned %d\n", ret);
                                BUG();
+                       }
                        tree_size--;
                }
+               release_path(root, &path);
        }
-       print_tree(root.node);
+       close_ctree(root);
        printf("tree size is now %d\n", tree_size);
        return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644 (file)
index 0000000..586bf18
--- /dev/null
@@ -0,0 +1,62 @@
+#ifndef __CTREE__
+#define __CTREE__
+
+#define CTREE_BLOCKSIZE 4096
+
+struct key {
+       u64 objectid;
+       u32 flags;
+       u64 offset;
+} __attribute__ ((__packed__));
+
+struct header {
+       u64 fsid[2]; /* FS specific uuid */
+       u64 blocknr;
+       u64 parentid;
+       u32 csum;
+       u32 ham;
+       u16 nritems;
+       u16 flags;
+} __attribute__ ((__packed__));
+
+#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
+                           (sizeof(struct key) + sizeof(u64)))
+
+#define LEVEL_BITS 3
+#define MAX_LEVEL (1 << LEVEL_BITS)
+#define node_level(f) ((f) & (MAX_LEVEL-1))
+#define is_leaf(f) (node_level(f) == 0)
+
+struct tree_buffer;
+struct ctree_root {
+       struct tree_buffer *node;
+       int fp;
+       struct radix_tree_root cache_radix;
+};
+
+struct item {
+       struct key key;
+       u16 offset;
+       u16 size;
+} __attribute__ ((__packed__));
+
+#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
+struct leaf {
+       struct header header;
+       union {
+               struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+               u8 data[CTREE_BLOCKSIZE-sizeof(struct header)];
+       };
+} __attribute__ ((__packed__));
+
+struct node {
+       struct header header;
+       struct key keys[NODEPTRS_PER_BLOCK];
+       u64 blockptrs[NODEPTRS_PER_BLOCK];
+} __attribute__ ((__packed__));
+
+struct ctree_path {
+       struct tree_buffer *nodes[MAX_LEVEL];
+       int slots[MAX_LEVEL];
+};
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644 (file)
index 0000000..8d51a07
--- /dev/null
@@ -0,0 +1,174 @@
+#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+static int allocated_blocks = 0;
+
+struct ctree_header {
+       u64 root_block;
+} __attribute__ ((__packed__));
+
+static int get_free_block(struct ctree_root *root, u64 *block)
+{
+       struct stat st;
+       int ret;
+
+       st.st_size = 0;
+       ret = fstat(root->fp, &st);
+       if (st.st_size > sizeof(struct ctree_header)) {
+               *block = (st.st_size -
+                       sizeof(struct ctree_header)) / CTREE_BLOCKSIZE;
+       } else {
+               *block = 0;
+       }
+       ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) *
+                       CTREE_BLOCKSIZE);
+       return ret;
+}
+
+struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
+{
+       struct tree_buffer *buf;
+       int ret;
+       buf = malloc(sizeof(struct tree_buffer));
+       if (!buf)
+               return buf;
+       allocated_blocks++;
+       buf->blocknr = blocknr;
+       buf->count = 1;
+       radix_tree_preload(GFP_KERNEL);
+       ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
+       radix_tree_preload_end();
+       if (ret) {
+               free(buf);
+               return NULL;
+       }
+       return buf;
+}
+
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+       u64 free_block;
+       int ret;
+       struct tree_buffer * buf;
+       ret = get_free_block(root, &free_block);
+       if (ret) {
+               BUG();
+               return NULL;
+       }
+       buf = alloc_tree_block(root, free_block);
+       if (!buf)
+               BUG();
+       return buf;
+}
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
+{
+       loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+       struct tree_buffer *buf;
+       int ret;
+
+       buf = radix_tree_lookup(&root->cache_radix, blocknr);
+       if (buf) {
+               buf->count++;
+               if (buf->blocknr != blocknr)
+                       BUG();
+               if (buf->blocknr != buf->node.header.blocknr)
+                       BUG();
+               return buf;
+       }
+       buf = alloc_tree_block(root, blocknr);
+       if (!buf)
+               return NULL;
+       ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+       if (ret != CTREE_BLOCKSIZE) {
+               free(buf);
+               return NULL;
+       }
+       if (buf->blocknr != buf->node.header.blocknr)
+               BUG();
+       return buf;
+}
+
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+       u64 blocknr = buf->blocknr;
+       loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+       int ret;
+
+       if (buf->blocknr != buf->node.header.blocknr)
+               BUG();
+       ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+       if (ret != CTREE_BLOCKSIZE)
+               return ret;
+       if (buf == root->node)
+               return update_root_block(root);
+       return 0;
+}
+
+struct ctree_root *open_ctree(char *filename)
+{
+       struct ctree_root *root = malloc(sizeof(struct ctree_root));
+       int fp;
+       u64 root_block;
+       int ret;
+
+       fp = open(filename, O_CREAT | O_RDWR);
+       if (fp < 0) {
+               free(root);
+               return NULL;
+       }
+       root->fp = fp;
+       INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+       ret = pread(fp, &root_block, sizeof(u64), 0);
+       if (ret == sizeof(u64)) {
+               printf("reading root node at block %lu\n", root_block);
+               root->node = read_tree_block(root, root_block);
+       } else
+               root->node = NULL;
+       return root;
+}
+
+int close_ctree(struct ctree_root *root)
+{
+       close(root->fp);
+       if (root->node)
+               tree_block_release(root, root->node);
+       free(root);
+       printf("on close %d blocks are allocated\n", allocated_blocks);
+       return 0;
+}
+
+int update_root_block(struct ctree_root *root)
+{
+       int ret;
+       u64 root_block = root->node->blocknr;
+
+       ret = pwrite(root->fp, &root_block, sizeof(u64), 0);
+       if (ret != sizeof(u64))
+               return ret;
+       return 0;
+}
+
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
+{
+       buf->count--;
+       if (buf->count == 0) {
+               if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
+                       BUG();
+               radix_tree_delete(&root->cache_radix, buf->blocknr);
+               memset(buf, 0, sizeof(*buf));
+               free(buf);
+               BUG_ON(allocated_blocks == 0);
+               allocated_blocks--;
+       }
+}
+
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644 (file)
index 0000000..ee95fa0
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef __DISKIO__
+#define __DISKIO__
+
+struct tree_buffer {
+       u64 blocknr;
+       int count;
+       union {
+               struct node node;
+               struct leaf leaf;
+       };
+};
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+struct ctree_root *open_ctree(char *filename);
+int close_ctree(struct ctree_root *root);
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int update_root_block(struct ctree_root *root);
+
+#endif
index 3a4bb4d661f972a459ee67a34424b07c241be3b4..347ca06205c236f045a88d6af45b3d62b28e144b 100644 (file)
@@ -6,6 +6,7 @@
 #define BITS_PER_LONG 64
 #define __GFP_BITS_SHIFT 20
 #define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
+#define GFP_KERNEL 0
 #define __read_mostly
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define __force
diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c
new file mode 100644 (file)
index 0000000..baa25ca
--- /dev/null
@@ -0,0 +1,836 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "kerncompat.h"
+#include "radix-tree.h"
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT   3       /* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE    (1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK    (RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS   \
+       ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+       unsigned int    count;
+       void            *slots[RADIX_TREE_MAP_SIZE];
+       unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+struct radix_tree_path {
+       struct radix_tree_node *node;
+       int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct radix_tree_preload {
+       int nr;
+       struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+};
+struct radix_tree_preload radix_tree_preloads = { 0, };
+
+static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+{
+       return root->gfp_mask & __GFP_BITS_MASK;
+}
+
+static int internal_nodes = 0;
+/*
+ * This assumes that the caller has performed appropriate preallocation, and
+ * that the caller has pinned this thread of control to the current CPU.
+ */
+static struct radix_tree_node *
+radix_tree_node_alloc(struct radix_tree_root *root)
+{
+       struct radix_tree_node *ret;
+       ret = malloc(sizeof(struct radix_tree_node));
+       if (ret) {
+               memset(ret, 0, sizeof(struct radix_tree_node));
+               internal_nodes++;
+       }
+       return ret;
+}
+
+static inline void
+radix_tree_node_free(struct radix_tree_node *node)
+{
+       internal_nodes--;
+       free(node);
+}
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+       struct radix_tree_preload *rtp;
+       struct radix_tree_node *node;
+       int ret = -ENOMEM;
+
+       preempt_disable();
+       rtp = &__get_cpu_var(radix_tree_preloads);
+       while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
+               preempt_enable();
+               node = radix_tree_node_alloc(NULL);
+               if (node == NULL)
+                       goto out;
+               preempt_disable();
+               rtp = &__get_cpu_var(radix_tree_preloads);
+               if (rtp->nr < ARRAY_SIZE(rtp->nodes))
+                       rtp->nodes[rtp->nr++] = node;
+               else
+                       radix_tree_node_free(node);
+       }
+       ret = 0;
+out:
+       return ret;
+}
+
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+               int offset)
+{
+       __set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+               int offset)
+{
+       __clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+               int offset)
+{
+       return test_bit(offset, node->tags[tag]);
+}
+
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+       root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+       root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+       root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+       return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+       int idx;
+       for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+               if (node->tags[tag][idx])
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ *     Return the maximum key which can be store into a
+ *     radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+       return height_to_maxindex[height];
+}
+
+/*
+ *     Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+       struct radix_tree_node *node;
+       unsigned int height;
+       int tag;
+
+       /* Figure out what the height should be.  */
+       height = root->height + 1;
+       while (index > radix_tree_maxindex(height))
+               height++;
+
+       if (root->rnode == NULL) {
+               root->height = height;
+               goto out;
+       }
+
+       do {
+               if (!(node = radix_tree_node_alloc(root)))
+                       return -ENOMEM;
+
+               /* Increase the height.  */
+               node->slots[0] = root->rnode;
+
+               /* Propagate the aggregated tag info into the new root */
+               for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+                       if (root_tag_get(root, tag))
+                               tag_set(node, tag, 0);
+               }
+
+               node->count = 1;
+               root->rnode = node;
+               root->height++;
+       } while (height > root->height);
+out:
+       return 0;
+}
+
+/**
+ *     radix_tree_insert    -    insert into a radix tree
+ *     @root:          radix tree root
+ *     @index:         index key
+ *     @item:          item to insert
+ *
+ *     Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+                       unsigned long index, void *item)
+{
+       struct radix_tree_node *node = NULL, *slot;
+       unsigned int height, shift;
+       int offset;
+       int error;
+
+       /* Make sure the tree is high enough.  */
+       if (index > radix_tree_maxindex(root->height)) {
+               error = radix_tree_extend(root, index);
+               if (error)
+                       return error;
+       }
+
+       slot = root->rnode;
+       height = root->height;
+       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+       offset = 0;                     /* uninitialised var warning */
+       while (height > 0) {
+               if (slot == NULL) {
+                       /* Have to add a child node.  */
+                       if (!(slot = radix_tree_node_alloc(root)))
+                               return -ENOMEM;
+                       if (node) {
+                               node->slots[offset] = slot;
+                               node->count++;
+                       } else
+                               root->rnode = slot;
+               }
+
+               /* Go a level down */
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+               node = slot;
+               slot = node->slots[offset];
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       }
+
+       if (slot != NULL)
+               return -EEXIST;
+
+       if (node) {
+               node->count++;
+               node->slots[offset] = item;
+               BUG_ON(tag_get(node, 0, offset));
+               BUG_ON(tag_get(node, 1, offset));
+       } else {
+               root->rnode = item;
+               BUG_ON(root_tag_get(root, 0));
+               BUG_ON(root_tag_get(root, 1));
+       }
+
+       return 0;
+}
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+                                  unsigned long index)
+{
+       unsigned int height, shift;
+       struct radix_tree_node **slot;
+
+       height = root->height;
+
+       if (index > radix_tree_maxindex(height))
+               return NULL;
+
+       if (height == 0 && root->rnode)
+               return (void **)&root->rnode;
+
+       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+       slot = &root->rnode;
+
+       while (height > 0) {
+               if (*slot == NULL)
+                       return NULL;
+
+               slot = (struct radix_tree_node **)
+                       ((*slot)->slots +
+                               ((index >> shift) & RADIX_TREE_MAP_MASK));
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       }
+
+       return (void **)slot;
+}
+
+/**
+ *     radix_tree_lookup_slot    -    lookup a slot in a radix tree
+ *     @root:          radix tree root
+ *     @index:         index key
+ *
+ *     Lookup the slot corresponding to the position @index in the radix tree
+ *     @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+       return __lookup_slot(root, index);
+}
+
+/**
+ *     radix_tree_lookup    -    perform lookup operation on a radix tree
+ *     @root:          radix tree root
+ *     @index:         index key
+ *
+ *     Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+       void **slot;
+
+       slot = __lookup_slot(root, index);
+       return slot != NULL ? *slot : NULL;
+}
+
+/**
+ *     radix_tree_tag_set - set a tag on a radix tree node
+ *     @root:          radix tree root
+ *     @index:         index key
+ *     @tag:           tag index
+ *
+ *     Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *     corresponding to @index in the radix tree.  From
+ *     the root all the way down to the leaf node.
+ *
+ *     Returns the address of the tagged item.   Setting a tag on a not-present
+ *     item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag)
+{
+       unsigned int height, shift;
+       struct radix_tree_node *slot;
+
+       height = root->height;
+       BUG_ON(index > radix_tree_maxindex(height));
+
+       slot = root->rnode;
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+
+       while (height > 0) {
+               int offset;
+
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+               if (!tag_get(slot, tag, offset))
+                       tag_set(slot, tag, offset);
+               slot = slot->slots[offset];
+               BUG_ON(slot == NULL);
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       }
+
+       /* set the root's tag bit */
+       if (slot && !root_tag_get(root, tag))
+               root_tag_set(root, tag);
+
+       return slot;
+}
+
+/**
+ *     radix_tree_tag_clear - clear a tag on a radix tree node
+ *     @root:          radix tree root
+ *     @index:         index key
+ *     @tag:           tag index
+ *
+ *     Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *     corresponding to @index in the radix tree.  If
+ *     this causes the leaf node to have no tags set then clear the tag in the
+ *     next-to-leaf node, etc.
+ *
+ *     Returns the address of the tagged item on success, else NULL.  ie:
+ *     has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag)
+{
+       struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+       struct radix_tree_node *slot = NULL;
+       unsigned int height, shift;
+
+       height = root->height;
+       if (index > radix_tree_maxindex(height))
+               goto out;
+
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+       pathp->node = NULL;
+       slot = root->rnode;
+
+       while (height > 0) {
+               int offset;
+
+               if (slot == NULL)
+                       goto out;
+
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+               pathp[1].offset = offset;
+               pathp[1].node = slot;
+               slot = slot->slots[offset];
+               pathp++;
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       }
+
+       if (slot == NULL)
+               goto out;
+
+       while (pathp->node) {
+               if (!tag_get(pathp->node, tag, pathp->offset))
+                       goto out;
+               tag_clear(pathp->node, tag, pathp->offset);
+               if (any_tag_set(pathp->node, tag))
+                       goto out;
+               pathp--;
+       }
+
+       /* clear the root's tag bit */
+       if (root_tag_get(root, tag))
+               root_tag_clear(root, tag);
+
+out:
+       return slot;
+}
+
+#ifndef __KERNEL__     /* Only the test harness uses this at present */
+/**
+ * radix_tree_tag_get - get a tag on a radix tree node
+ * @root:              radix tree root
+ * @index:             index key
+ * @tag:               tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Return values:
+ *
+ *  0: tag not present or not set
+ *  1: tag set
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag)
+{
+       unsigned int height, shift;
+       struct radix_tree_node *slot;
+       int saw_unset_tag = 0;
+
+       height = root->height;
+       if (index > radix_tree_maxindex(height))
+               return 0;
+
+       /* check the root's tag bit */
+       if (!root_tag_get(root, tag))
+               return 0;
+
+       if (height == 0)
+               return 1;
+
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+       slot = root->rnode;
+
+       for ( ; ; ) {
+               int offset;
+
+               if (slot == NULL)
+                       return 0;
+
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+               /*
+                * This is just a debug check.  Later, we can bale as soon as
+                * we see an unset tag.
+                */
+               if (!tag_get(slot, tag, offset))
+                       saw_unset_tag = 1;
+               if (height == 1) {
+                       int ret = tag_get(slot, tag, offset);
+
+                       BUG_ON(ret && saw_unset_tag);
+                       return !!ret;
+               }
+               slot = slot->slots[offset];
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       }
+}
+#endif
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+       unsigned int max_items, unsigned long *next_index)
+{
+       unsigned int nr_found = 0;
+       unsigned int shift, height;
+       struct radix_tree_node *slot;
+       unsigned long i;
+
+       height = root->height;
+       if (height == 0) {
+               if (root->rnode && index == 0)
+                       results[nr_found++] = root->rnode;
+               goto out;
+       }
+
+       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+       slot = root->rnode;
+
+       for ( ; height > 1; height--) {
+
+               for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+                               i < RADIX_TREE_MAP_SIZE; i++) {
+                       if (slot->slots[i] != NULL)
+                               break;
+                       index &= ~((1UL << shift) - 1);
+                       index += 1UL << shift;
+                       if (index == 0)
+                               goto out;       /* 32-bit wraparound */
+               }
+               if (i == RADIX_TREE_MAP_SIZE)
+                       goto out;
+
+               shift -= RADIX_TREE_MAP_SHIFT;
+               slot = slot->slots[i];
+       }
+
+       /* Bottom level: grab some items */
+       for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+               index++;
+               if (slot->slots[i]) {
+                       results[nr_found++] = slot->slots[i];
+                       if (nr_found == max_items)
+                               goto out;
+               }
+       }
+out:
+       *next_index = index;
+       return nr_found;
+}
+
+/**
+ *     radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ *     @root:          radix tree root
+ *     @results:       where the results of the lookup are placed
+ *     @first_index:   start the lookup from this key
+ *     @max_items:     place up to this many items at *results
+ *
+ *     Performs an index-ascending scan of the tree for present items.  Places
+ *     them at *@results and returns the number of items which were placed at
+ *     *@results.
+ *
+ *     The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+                       unsigned long first_index, unsigned int max_items)
+{
+       const unsigned long max_index = radix_tree_maxindex(root->height);
+       unsigned long cur_index = first_index;
+       unsigned int ret = 0;
+
+       while (ret < max_items) {
+               unsigned int nr_found;
+               unsigned long next_index;       /* Index of next search */
+
+               if (cur_index > max_index)
+                       break;
+               nr_found = __lookup(root, results + ret, cur_index,
+                                       max_items - ret, &next_index);
+               ret += nr_found;
+               if (next_index == 0)
+                       break;
+               cur_index = next_index;
+       }
+       return ret;
+}
+
+/*
+ * FIXME: the two tag_get()s here should use find_next_bit() instead of
+ * open-coding the search.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+       unsigned int max_items, unsigned long *next_index, unsigned int tag)
+{
+       unsigned int nr_found = 0;
+       unsigned int shift;
+       unsigned int height = root->height;
+       struct radix_tree_node *slot;
+
+       if (height == 0) {
+               if (root->rnode && index == 0)
+                       results[nr_found++] = root->rnode;
+               goto out;
+       }
+
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+       slot = root->rnode;
+
+       do {
+               unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+               for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+                       if (tag_get(slot, tag, i)) {
+                               BUG_ON(slot->slots[i] == NULL);
+                               break;
+                       }
+                       index &= ~((1UL << shift) - 1);
+                       index += 1UL << shift;
+                       if (index == 0)
+                               goto out;       /* 32-bit wraparound */
+               }
+               if (i == RADIX_TREE_MAP_SIZE)
+                       goto out;
+               height--;
+               if (height == 0) {      /* Bottom level: grab some items */
+                       unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+                       for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+                               index++;
+                               if (tag_get(slot, tag, j)) {
+                                       BUG_ON(slot->slots[j] == NULL);
+                                       results[nr_found++] = slot->slots[j];
+                                       if (nr_found == max_items)
+                                               goto out;
+                               }
+                       }
+               }
+               shift -= RADIX_TREE_MAP_SHIFT;
+               slot = slot->slots[i];
+       } while (height > 0);
+out:
+       *next_index = index;
+       return nr_found;
+}
+
+/**
+ *     radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ *                                  based on a tag
+ *     @root:          radix tree root
+ *     @results:       where the results of the lookup are placed
+ *     @first_index:   start the lookup from this key
+ *     @max_items:     place up to this many items at *results
+ *     @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ *     Performs an index-ascending scan of the tree for present items which
+ *     have the tag indexed by @tag set.  Places the items at *@results and
+ *     returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+               unsigned long first_index, unsigned int max_items,
+               unsigned int tag)
+{
+       const unsigned long max_index = radix_tree_maxindex(root->height);
+       unsigned long cur_index = first_index;
+       unsigned int ret = 0;
+
+       /* check the root's tag bit */
+       if (!root_tag_get(root, tag))
+               return 0;
+
+       while (ret < max_items) {
+               unsigned int nr_found;
+               unsigned long next_index;       /* Index of next search */
+
+               if (cur_index > max_index)
+                       break;
+               nr_found = __lookup_tag(root, results + ret, cur_index,
+                                       max_items - ret, &next_index, tag);
+               ret += nr_found;
+               if (next_index == 0)
+                       break;
+               cur_index = next_index;
+       }
+       return ret;
+}
+
+/**
+ *     radix_tree_shrink    -    shrink height of a radix tree to minimal
+ *     @root           radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+       /* try to shrink tree height */
+       while (root->height > 0 &&
+                       root->rnode->count == 1 &&
+                       root->rnode->slots[0]) {
+               struct radix_tree_node *to_free = root->rnode;
+
+               root->rnode = to_free->slots[0];
+               root->height--;
+               /* must only free zeroed nodes into the slab */
+               tag_clear(to_free, 0, 0);
+               tag_clear(to_free, 1, 0);
+               to_free->slots[0] = NULL;
+               to_free->count = 0;
+               radix_tree_node_free(to_free);
+       }
+}
+
+/**
+ *     radix_tree_delete    -    delete an item from a radix tree
+ *     @root:          radix tree root
+ *     @index:         index key
+ *
+ *     Remove the item at @index from the radix tree rooted at @root.
+ *
+ *     Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+       struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+       struct radix_tree_node *slot = NULL;
+       unsigned int height, shift;
+       int tag;
+       int offset;
+
+       height = root->height;
+       if (index > radix_tree_maxindex(height))
+               goto out;
+
+       slot = root->rnode;
+       if (height == 0 && root->rnode) {
+               root_tag_clear_all(root);
+               root->rnode = NULL;
+               goto out;
+       }
+
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+       pathp->node = NULL;
+
+       do {
+               if (slot == NULL)
+                       goto out;
+
+               pathp++;
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+               pathp->offset = offset;
+               pathp->node = slot;
+               slot = slot->slots[offset];
+               shift -= RADIX_TREE_MAP_SHIFT;
+               height--;
+       } while (height > 0);
+
+       if (slot == NULL)
+               goto out;
+
+       /*
+        * Clear all tags associated with the just-deleted item
+        */
+       for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+               if (tag_get(pathp->node, tag, pathp->offset))
+                       radix_tree_tag_clear(root, index, tag);
+       }
+
+       /* Now free the nodes we do not need anymore */
+       while (pathp->node) {
+               pathp->node->slots[pathp->offset] = NULL;
+               pathp->node->count--;
+
+               if (pathp->node->count) {
+                       if (pathp->node == root->rnode)
+                               radix_tree_shrink(root);
+                       goto out;
+               }
+
+               /* Node with zero slots in use so free it */
+               radix_tree_node_free(pathp->node);
+
+               pathp--;
+       }
+       root_tag_clear_all(root);
+       root->height = 0;
+       root->rnode = NULL;
+
+out:
+       return slot;
+}
+
+/**
+ *     radix_tree_tagged - test whether any items in the tree are tagged
+ *     @root:          radix tree root
+ *     @tag:           tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+       return root_tag_get(root, tag);
+}
+
+static unsigned long __maxindex(unsigned int height)
+{
+       unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+       unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+       if (tmp >= RADIX_TREE_INDEX_BITS)
+               index = ~0UL;
+       return index;
+}
+
+static void radix_tree_init_maxindex(void)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+               height_to_maxindex[i] = __maxindex(i);
+}
+
+void radix_tree_init(void)
+{
+       radix_tree_init_maxindex();
+}
diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h
new file mode 100644 (file)
index 0000000..c3ce881
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_RADIX_TREE_H
+#define _LINUX_RADIX_TREE_H
+
+#define RADIX_TREE_MAX_TAGS 2
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+struct radix_tree_root {
+       unsigned int            height;
+       gfp_t                   gfp_mask;
+       struct radix_tree_node  *rnode;
+};
+
+#define RADIX_TREE_INIT(mask)  {                                       \
+       .height = 0,                                                    \
+       .gfp_mask = (mask),                                             \
+       .rnode = NULL,                                                  \
+}
+
+#define RADIX_TREE(name, mask) \
+       struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask)                                    \
+do {                                                                   \
+       (root)->height = 0;                                             \
+       (root)->gfp_mask = (mask);                                      \
+       (root)->rnode = NULL;                                           \
+} while (0)
+
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+                       unsigned long first_index, unsigned int max_items);
+int radix_tree_preload(gfp_t gfp_mask);
+void radix_tree_init(void);
+void *radix_tree_tag_set(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+                       unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+               unsigned long first_index, unsigned int max_items,
+               unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+
+static inline void radix_tree_preload_end(void)
+{
+       preempt_enable();
+}
+
+#endif /* _LINUX_RADIX_TREE_H */