From 2baba25019ec564cd247af74013873d69a0b8190 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 18 Dec 2009 13:51:57 -0800 Subject: [PATCH] ceph: writeback congestion control Set bdi congestion bit when amount of write data in flight exceeds adjustable threshold. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/addr.c | 35 +++++++++++++++++++++++++++++++++-- fs/ceph/debugfs.c | 33 +++++++++++++++++++++++++++++++++ fs/ceph/super.c | 36 ++++++++++++++++++++++++++++++++++++ fs/ceph/super.h | 3 +++ 4 files changed, 105 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d0cdceb0b90b..a6850a14038e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -47,6 +47,12 @@ * accounting is preserved. */ +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + /* * Dirty a page. Optimistically adjust accounting, on the assumption @@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; + struct ceph_client *client; struct ceph_osd_client *osdc; loff_t page_off = page->index << PAGE_CACHE_SHIFT; int len = PAGE_CACHE_SIZE; @@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) int err = 0; struct ceph_snap_context *snapc; u64 snap_size = 0; + long writeback_stat; dout("writepage %p idx %lu\n", page, page->index); @@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - osdc = &ceph_inode_to_client(inode)->osdc; + client = ceph_inode_to_client(inode); + osdc = &client->osdc; /* verify this is a writeable snap context */ snapc = (void *)page->private; @@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct 
writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u\n", inode, page, page->index, page_off, len); + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req, struct writeback_control *wbc = req->r_wbc; __s32 rc = -EIO; u64 bytes = 0; + struct ceph_client *client = ceph_inode_to_client(inode); + long writeback_stat; /* parse reply */ replyhead = msg->front.iov_base; @@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req, BUG_ON(!page); WARN_ON(!PageUptodate(page)); + writeback_stat = + atomic_long_dec_return(&client->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) + clear_bdi_congested(&client->backing_dev_info, + BLK_RW_ASYNC); + if (i >= wrote) { dout("inode %p skipping page %p\n", inode, page); wbc->pages_skipped++; @@ -666,6 +689,7 @@ retry: u64 offset, len; struct ceph_osd_request_head *reqhead; struct ceph_osd_op *op; + long writeback_stat; next = 0; locked_pages = 0; @@ -773,6 +797,12 @@ get_more_pages: first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); + + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + } + set_page_writeback(page); req->r_pages[locked_pages] = page; locked_pages++; @@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_client 
*client = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = &client->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 441484ab7e94..22d3b47fb1be 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -320,6 +320,30 @@ DEFINE_SHOW_FUNC(osdc_show) DEFINE_SHOW_FUNC(dentry_lru_show) DEFINE_SHOW_FUNC(caps_show) +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + client->mount_args->congestion_kb = (int)val; + + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + *val = (u64)client->mount_args->congestion_kb; + + return 0; +} + + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + int __init ceph_debugfs_init(void) { ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); @@ -409,6 +433,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_caps) goto out; + client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", + 0600, + client->debugfs_dir, + client, + &congestion_kb_fops); + if (!client->debugfs_congestion_kb) + goto out; + sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, name); @@ -431,6 +463,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) debugfs_remove(client->osdc.debugfs_file); debugfs_remove(client->mdsc.debugfs_file); debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_congestion_kb); debugfs_remove(client->debugfs_dir); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6d02a166f8ff..b9cb8cebcdc1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -150,6 +150,35 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } +static int 
default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -267,6 +296,7 @@ enum { Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_readdir_max_entries, + Opt_congestion_kb, Opt_last_int, /* int args above */ Opt_snapdirname, @@ -295,6 +325,7 @@ static match_table_t arg_tokens = { {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_name, "name=%s"}, @@ -342,6 +373,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; args->max_readdir = 1024; + args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ err = -EINVAL; @@ -445,6 +477,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_congestion_kb: + args->congestion_kb = intval; + break; case Opt_noshare: args->flags |= CEPH_OPT_NOSHARE; @@ -516,6 +551,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) client->msgr = NULL; client->mount_err = 0; + atomic_long_set(&client->writeback_count, 0); 
err = bdi_init(&client->backing_dev_info); if (err < 0) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2304bd2844a4..62d9ae482d72 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -59,6 +59,7 @@ struct ceph_mount_args { int wsize; int rsize; /* max readahead */ int max_readdir; /* max readdir size */ + int congestion_kb; /* in-flight writeback threshold (KB) for bdi congestion */ int osd_timeout; char *snapdir_name; /* default ".snap" */ char *name; @@ -136,6 +137,7 @@ struct ceph_client { struct workqueue_struct *wb_wq; struct workqueue_struct *pg_inv_wq; struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; struct backing_dev_info backing_dev_info; @@ -143,6 +145,7 @@ struct ceph_client { struct dentry *debugfs_monmap; struct dentry *debugfs_mdsmap, *debugfs_osdmap; struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; #endif }; -- 2.20.1