[PATCH] nfs: fix congestion control
authorPeter Zijlstra <a.p.zijlstra@chello.nl>
Fri, 16 Mar 2007 21:38:26 +0000 (13:38 -0800)
committerLinus Torvalds <torvalds@woody.linux-foundation.org>
Sat, 17 Mar 2007 02:25:05 +0000 (19:25 -0700)
The current NFS client congestion logic is severly broken, it marks the
backing device congested during each nfs_writepages() call but doesn't
mirror this in nfs_writepage() which makes for deadlocks.  Also it
implements its own waitqueue.

Replace this by a more regular congestion implementation that puts a cap on
the number of active writeback pages and uses the bdi congestion waitqueue.

Also always use an interruptible wait since it makes sense to be able to
SIGKILL the process even for mounts without 'intr'.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/nfs/super.c
fs/nfs/sysctl.c
fs/nfs/write.c
include/linux/backing-dev.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
mm/backing-dev.c

index bb516a2cfbafa4f50446b4ca5e99e1d8c6303831..f1eae44b9a1aec5167dfefb934437ccaedba2576 100644 (file)
@@ -151,10 +151,10 @@ int __init register_nfs_fs(void)
        if (ret < 0)
                goto error_0;
 
-#ifdef CONFIG_NFS_V4
        ret = nfs_register_sysctl();
        if (ret < 0)
                goto error_1;
+#ifdef CONFIG_NFS_V4
        ret = register_filesystem(&nfs4_fs_type);
        if (ret < 0)
                goto error_2;
@@ -165,9 +165,9 @@ int __init register_nfs_fs(void)
 #ifdef CONFIG_NFS_V4
 error_2:
        nfs_unregister_sysctl();
+#endif
 error_1:
        unregister_filesystem(&nfs_fs_type);
-#endif
 error_0:
        return ret;
 }
index fcdcafbb32939931c111d3fe75458d71f6f676ff..b62481dabae907a7f11e56c39f2179b4cc389819 100644 (file)
@@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nfs_congestion_kb",
+               .data           = &nfs_congestion_kb,
+               .maxlen         = sizeof(nfs_congestion_kb),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        { .ctl_name = 0 }
 };
 
index febdade91670562b003a43fd3e0ee8ad900445eb..2867e6b7096f006c174ca58a087c584f3cc561b2 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
+#include <linux/swap.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
@@ -38,7 +39,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*,
                                            struct page *,
                                            unsigned int, unsigned int);
 static void nfs_mark_request_dirty(struct nfs_page *req);
-static int nfs_wait_on_write_congestion(struct address_space *, int);
 static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
 static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
@@ -48,8 +48,6 @@ static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-
 struct nfs_write_data *nfs_commit_alloc(void)
 {
        struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -210,6 +208,40 @@ static int wb_priority(struct writeback_control *wbc)
        return 0;
 }
 
+/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb;
+
+#define NFS_CONGESTION_ON_THRESH       (nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH      \
+       (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static void nfs_set_page_writeback(struct page *page)
+{
+       if (!test_set_page_writeback(page)) {
+               struct inode *inode = page->mapping->host;
+               struct nfs_server *nfss = NFS_SERVER(inode);
+
+               if (atomic_inc_return(&nfss->writeback) >
+                               NFS_CONGESTION_ON_THRESH)
+                       set_bdi_congested(&nfss->backing_dev_info, WRITE);
+       }
+}
+
+static void nfs_end_page_writeback(struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       struct nfs_server *nfss = NFS_SERVER(inode);
+
+       end_page_writeback(page);
+       if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+               clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+               congestion_end(WRITE);
+       }
+}
+
 /*
  * Find an associated nfs write request, and prepare to flush it out
  * Returns 1 if there was no write request, or if the request was
@@ -247,7 +279,7 @@ static int nfs_page_mark_flush(struct page *page)
        spin_unlock(req_lock);
        if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
                nfs_mark_request_dirty(req);
-               set_page_writeback(page);
+               nfs_set_page_writeback(page);
        }
        ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
        nfs_unlock_request(req);
@@ -302,13 +334,8 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
        return err; 
 }
 
-/*
- * Note: causes nfs_update_request() to block on the assumption
- *      that the writeback is generated due to memory pressure.
- */
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-       struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct inode *inode = mapping->host;
        int err;
 
@@ -317,20 +344,12 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
        err = generic_writepages(mapping, wbc);
        if (err)
                return err;
-       while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
-               if (wbc->nonblocking)
-                       return 0;
-               nfs_wait_on_write_congestion(mapping, 0);
-       }
        err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
        if (err < 0)
                goto out;
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
        err = 0;
 out:
-       clear_bit(BDI_write_congested, &bdi->state);
-       wake_up_all(&nfs_write_congestion);
-       congestion_end(WRITE);
        return err;
 }
 
@@ -360,7 +379,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 }
 
 /*
- * Insert a write request into an inode
+ * Remove a write request from an inode
  */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
@@ -531,10 +550,10 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, un
 }
 #endif
 
-static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
+static int nfs_wait_on_write_congestion(struct address_space *mapping)
 {
+       struct inode *inode = mapping->host;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
-       DEFINE_WAIT(wait);
        int ret = 0;
 
        might_sleep();
@@ -542,31 +561,23 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
        if (!bdi_write_congested(bdi))
                return 0;
 
-       nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+       nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
 
-       if (intr) {
-               struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
+       do {
+               struct rpc_clnt *clnt = NFS_CLIENT(inode);
                sigset_t oldset;
 
                rpc_clnt_sigmask(clnt, &oldset);
-               prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
-               if (bdi_write_congested(bdi)) {
-                       if (signalled())
-                               ret = -ERESTARTSYS;
-                       else
-                               schedule();
-               }
+               ret = congestion_wait_interruptible(WRITE, HZ/10);
                rpc_clnt_sigunmask(clnt, &oldset);
-       } else {
-               prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
-               if (bdi_write_congested(bdi))
-                       schedule();
-       }
-       finish_wait(&nfs_write_congestion, &wait);
+               if (ret == -ERESTARTSYS)
+                       break;
+               ret = 0;
+       } while (bdi_write_congested(bdi));
+
        return ret;
 }
 
-
 /*
  * Try to update any existing write request, or create one if there is none.
  * In order to match, the request's credentials must match those of
@@ -577,14 +588,15 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
 static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
                struct page *page, unsigned int offset, unsigned int bytes)
 {
-       struct inode *inode = page->mapping->host;
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_page         *req, *new = NULL;
        unsigned long           rqend, end;
 
        end = offset + bytes;
 
-       if (nfs_wait_on_write_congestion(page->mapping, NFS_SERVER(inode)->flags & NFS_MOUNT_INTR))
+       if (nfs_wait_on_write_congestion(mapping))
                return ERR_PTR(-ERESTARTSYS);
        for (;;) {
                /* Loop over all inode entries and see if we find
@@ -727,7 +739,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
-       end_page_writeback(req->wb_page);
+       nfs_end_page_writeback(req->wb_page);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
        if (!PageError(req->wb_page)) {
@@ -1042,12 +1054,12 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
                if (task->tk_status < 0) {
                        nfs_set_pageerror(page);
                        req->wb_context->error = task->tk_status;
-                       end_page_writeback(page);
+                       nfs_end_page_writeback(page);
                        nfs_inode_remove_request(req);
                        dprintk(", error = %d\n", task->tk_status);
                        goto next;
                }
-               end_page_writeback(page);
+               nfs_end_page_writeback(page);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
                if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1514,6 +1526,26 @@ int __init nfs_init_writepagecache(void)
        if (nfs_commit_mempool == NULL)
                return -ENOMEM;
 
+       /*
+        * NFS congestion size, scale with available memory.
+        *
+        *  64MB:    8192k
+        * 128MB:   11585k
+        * 256MB:   16384k
+        * 512MB:   23170k
+        *   1GB:   32768k
+        *   2GB:   46340k
+        *   4GB:   65536k
+        *   8GB:   92681k
+        *  16GB:  131072k
+        *
+        * This allows larger machines to have larger/more transfers.
+        * Limit the default to 256M
+        */
+       nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+       if (nfs_congestion_kb > 256*1024)
+               nfs_congestion_kb = 256*1024;
+
        return 0;
 }
 
index 7011d6255593dda940f859ab02e89e505d1a57ae..f2542c24b328b291c23b3cbee3a12a9f1ae45e86 100644 (file)
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
+long congestion_wait_interruptible(int rw, long timeout);
 void congestion_end(int rw);
 
 #define bdi_cap_writeback_dirty(bdi) \
index 47aaa2c66738c5d574b96d9048964e7822227a00..e9ae0c6e2c62048a9410df000cac3b8350e70c69 100644 (file)
@@ -415,6 +415,7 @@ extern void nfs_complete_unlink(struct dentry *);
 /*
  * linux/fs/nfs/write.c
  */
+extern int  nfs_congestion_kb;
 extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
index 95796e6924f1d10be492771595275b7c169dde3a..c95d5e642548428cbdbff5ad0f6462f6ee75f4d8 100644 (file)
@@ -82,6 +82,7 @@ struct nfs_server {
        struct rpc_clnt *       client_acl;     /* ACL RPC client handle */
        struct nfs_iostats *    io_stats;       /* I/O statistics */
        struct backing_dev_info backing_dev_info;
+       atomic_t                writeback;      /* number of writeback pages */
        int                     flags;          /* various flags */
        unsigned int            caps;           /* server capabilities */
        unsigned int            rsize;          /* read size */
index f50a2811f9dc86b876d24e75333558583c075f1d..e5de3781d3feed5b6f156b064b4af8bb79871fed 100644 (file)
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+long congestion_wait_interruptible(int rw, long timeout)
+{
+       long ret;
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+       prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+       if (signal_pending(current))
+               ret = -ERESTARTSYS;
+       else
+               ret = io_schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+       return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE