nfs: Use UNSTABLE + COMMIT for NFS O_DIRECT writes
author Trond Myklebust <Trond.Myklebust@netapp.com>
Mon, 20 Mar 2006 18:44:36 +0000 (13:44 -0500)
committer Trond Myklebust <Trond.Myklebust@netapp.com>
Mon, 20 Mar 2006 18:44:36 +0000 (13:44 -0500)
Currently NFS O_DIRECT writes use FILE_SYNC so that a COMMIT is not
necessary.  This simplifies the internal logic, but this could be a
difficult workload for some servers.

Instead, let's send UNSTABLE writes, and after they all complete, send a
COMMIT for the dirty range.  After the COMMIT returns successfully, then do
the wake_up or fire off aio_complete().

Test plan:
Async direct I/O tests against Solaris (or any server that requires
committed unstable writes).  Reboot server during test.

Based on an earlier patch by Chuck Lever <cel@netapp.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
fs/nfs/direct.c
include/linux/nfs_fs.h

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 737990dd4dfe257510239b09a2d78acc17cbb398..f0f2053c7a61939a72b2e445b095106536b93357 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,11 +69,15 @@ struct nfs_direct_req {
        struct kref             kref;           /* release manager */
 
        /* I/O parameters */
-       struct list_head        list;           /* nfs_read/write_data structs */
+       struct list_head        list,           /* nfs_read/write_data structs */
+                               rewrite_list;   /* saved nfs_write_data structs */
        struct file *           filp;           /* file descriptor */
        struct kiocb *          iocb;           /* controlling i/o request */
        wait_queue_head_t       wait;           /* wait for i/o completion */
        struct inode *          inode;          /* target file of i/o */
+       unsigned long           user_addr;      /* location of user's buffer */
+       size_t                  user_count;     /* total bytes to move */
+       loff_t                  pos;            /* starting offset in file */
        struct page **          pages;          /* pages in our buffer */
        unsigned int            npages;         /* count of pages */
 
@@ -82,8 +86,18 @@ struct nfs_direct_req {
        int                     outstanding;    /* i/os we're waiting for */
        ssize_t                 count,          /* bytes actually processed */
                                error;          /* any reported error */
+
+       /* commit state */
+       struct nfs_write_data * commit_data;    /* special write_data for commits */
+       int                     flags;
+#define NFS_ODIRECT_DO_COMMIT          (1)     /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES     (2)     /* write verification failed */
+       struct nfs_writeverf    verf;           /* unstable write verifier */
 };
 
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -160,11 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
        kref_init(&dreq->kref);
        init_waitqueue_head(&dreq->wait);
        INIT_LIST_HEAD(&dreq->list);
+       INIT_LIST_HEAD(&dreq->rewrite_list);
        dreq->iocb = NULL;
        spin_lock_init(&dreq->lock);
        dreq->outstanding = 0;
        dreq->count = 0;
        dreq->error = 0;
+       dreq->flags = 0;
 
        return dreq;
 }
@@ -299,7 +315,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
  * For each nfs_read_data struct that was allocated on the list, dispatch
  * an NFS READ operation
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
 {
        struct file *file = dreq->filp;
        struct inode *inode = file->f_mapping->host;
@@ -307,11 +323,13 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
                                                        file->private_data;
        struct list_head *list = &dreq->list;
        struct page **pages = dreq->pages;
+       size_t count = dreq->user_count;
+       loff_t pos = dreq->pos;
        size_t rsize = NFS_SERVER(inode)->rsize;
        unsigned int curpage, pgbase;
 
        curpage = 0;
-       pgbase = user_addr & ~PAGE_MASK;
+       pgbase = dreq->user_addr & ~PAGE_MASK;
        do {
                struct nfs_read_data *data;
                size_t bytes;
@@ -373,6 +391,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
        if (!dreq)
                return -ENOMEM;
 
+       dreq->user_addr = user_addr;
+       dreq->user_count = count;
+       dreq->pos = pos;
        dreq->pages = pages;
        dreq->npages = nr_pages;
        igrab(inode);
@@ -383,13 +404,137 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
        nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
        rpc_clnt_sigmask(clnt, &oldset);
-       nfs_direct_read_schedule(dreq, user_addr, count, pos);
+       nfs_direct_read_schedule(dreq);
        result = nfs_direct_wait(dreq);
        rpc_clnt_sigunmask(clnt, &oldset);
 
        return result;
 }
 
+static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
+{
+       list_splice_init(&dreq->rewrite_list, &dreq->list);
+       while (!list_empty(&dreq->list)) {
+               struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
+               list_del(&data->pages);
+               nfs_writedata_release(data);
+       }
+}
+
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+       struct list_head *pos;
+
+       list_splice_init(&dreq->rewrite_list, &dreq->list);
+       list_for_each(pos, &dreq->list)
+               dreq->outstanding++;
+       dreq->count = 0;
+
+       nfs_direct_write_schedule(dreq, FLUSH_STABLE);
+}
+
+static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
+{
+       struct nfs_write_data *data = calldata;
+       struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+
+       /* Call the NFS version-specific code */
+       if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
+               return;
+       if (unlikely(task->tk_status < 0)) {
+               dreq->error = task->tk_status;
+               dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+       }
+       if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+               dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
+               dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+       }
+
+       dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
+       nfs_direct_write_complete(dreq, data->inode);
+}
+
+static const struct rpc_call_ops nfs_commit_direct_ops = {
+       .rpc_call_done = nfs_direct_commit_result,
+       .rpc_release = nfs_commit_release,
+};
+
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+       struct file *file = dreq->filp;
+       struct nfs_open_context *ctx = (struct nfs_open_context *)
+                                                       file->private_data;
+       struct nfs_write_data *data = dreq->commit_data;
+       struct rpc_task *task = &data->task;
+
+       data->inode = dreq->inode;
+       data->cred = ctx->cred;
+
+       data->args.fh = NFS_FH(data->inode);
+       data->args.offset = dreq->pos;
+       data->args.count = dreq->user_count;
+       data->res.count = 0;
+       data->res.fattr = &data->fattr;
+       data->res.verf = &data->verf;
+
+       rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
+                               &nfs_commit_direct_ops, data);
+       NFS_PROTO(data->inode)->commit_setup(data, 0);
+
+       data->task.tk_priority = RPC_PRIORITY_NORMAL;
+       data->task.tk_cookie = (unsigned long)data->inode;
+       /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+       dreq->commit_data = NULL;
+
+       dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+
+       lock_kernel();
+       rpc_execute(&data->task);
+       unlock_kernel();
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+       int flags = dreq->flags;
+
+       dreq->flags = 0;
+       switch (flags) {
+               case NFS_ODIRECT_DO_COMMIT:
+                       nfs_direct_commit_schedule(dreq);
+                       break;
+               case NFS_ODIRECT_RESCHED_WRITES:
+                       nfs_direct_write_reschedule(dreq);
+                       break;
+               default:
+                       nfs_end_data_update(inode);
+                       if (dreq->commit_data != NULL)
+                               nfs_commit_free(dreq->commit_data);
+                       nfs_direct_free_writedata(dreq);
+                       nfs_direct_complete(dreq);
+       }
+}
+
+static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+       dreq->commit_data = nfs_commit_alloc(0);
+       if (dreq->commit_data != NULL)
+               dreq->commit_data->req = (struct nfs_page *) dreq;
+}
+#else
+static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+       dreq->commit_data = NULL;
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+       nfs_end_data_update(inode);
+       nfs_direct_free_writedata(dreq);
+       nfs_direct_complete(dreq);
+}
+#endif
+
 static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
 {
        struct list_head *list;
@@ -424,14 +569,13 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize
                        break;
                nbytes -= wsize;
        }
+
+       nfs_alloc_commit_data(dreq);
+
        kref_get(&dreq->kref);
        return dreq;
 }
 
-/*
- * NB: Return the value of the first error return code.  Subsequent
- *     errors after the first one are ignored.
- */
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
        struct nfs_write_data *data = calldata;
@@ -440,41 +584,62 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 
        if (nfs_writeback_done(task, data) != 0)
                return;
-       /* If the server fell back to an UNSTABLE write, it's an error. */
-       if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
-               status = -EIO;
 
        spin_lock(&dreq->lock);
 
        if (likely(status >= 0))
                dreq->count += data->res.count;
        else
-               dreq->error = status;
+               dreq->error = task->tk_status;
 
+       if (data->res.verf->committed != NFS_FILE_SYNC) {
+               switch (dreq->flags) {
+                       case 0:
+                               memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
+                               dreq->flags = NFS_ODIRECT_DO_COMMIT;
+                               break;
+                       case NFS_ODIRECT_DO_COMMIT:
+                               if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
+                                       dprintk("NFS: %5u write verify failed\n", task->tk_pid);
+                                       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+                               }
+               }
+       }
+       /* In case we have to resend */
+       data->args.stable = NFS_FILE_SYNC;
+
+       spin_unlock(&dreq->lock);
+}
+
+/*
+ * NB: Return the value of the first error return code.  Subsequent
+ *     errors after the first one are ignored.
+ */
+static void nfs_direct_write_release(void *calldata)
+{
+       struct nfs_write_data *data = calldata;
+       struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+
+       spin_lock(&dreq->lock);
        if (--dreq->outstanding) {
                spin_unlock(&dreq->lock);
                return;
        }
-
        spin_unlock(&dreq->lock);
 
-       nfs_end_data_update(data->inode);
-       nfs_direct_complete(dreq);
+       nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
        .rpc_call_done = nfs_direct_write_result,
-       .rpc_release = nfs_writedata_release,
+       .rpc_release = nfs_direct_write_release,
 };
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
  * an NFS WRITE operation
- *
- * XXX: For now, support only FILE_SYNC writes.  Later we may add
- *      support for UNSTABLE + COMMIT.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
 {
        struct file *file = dreq->filp;
        struct inode *inode = file->f_mapping->host;
@@ -482,11 +647,13 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
                                                        file->private_data;
        struct list_head *list = &dreq->list;
        struct page **pages = dreq->pages;
+       size_t count = dreq->user_count;
+       loff_t pos = dreq->pos;
        size_t wsize = NFS_SERVER(inode)->wsize;
        unsigned int curpage, pgbase;
 
        curpage = 0;
-       pgbase = user_addr & ~PAGE_MASK;
+       pgbase = dreq->user_addr & ~PAGE_MASK;
        do {
                struct nfs_write_data *data;
                size_t bytes;
@@ -496,7 +663,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
                        bytes = count;
 
                data = list_entry(list->next, struct nfs_write_data, pages);
-               list_del_init(&data->pages);
+               list_move_tail(&data->pages, &dreq->rewrite_list);
 
                data->inode = inode;
                data->cred = ctx->cred;
@@ -512,7 +679,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
                rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
                                &nfs_write_direct_ops, data);
-               NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
+               NFS_PROTO(inode)->write_setup(data, sync);
 
                data->task.tk_priority = RPC_PRIORITY_NORMAL;
                data->task.tk_cookie = (unsigned long) inode;
@@ -544,11 +711,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_direct_req *dreq;
+       size_t wsize = NFS_SERVER(inode)->wsize;
+       int sync = 0;
 
-       dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize);
+       dreq = nfs_direct_write_alloc(count, wsize);
        if (!dreq)
                return -ENOMEM;
+       if (dreq->commit_data == NULL || count < wsize)
+               sync = FLUSH_STABLE;
 
+       dreq->user_addr = user_addr;
+       dreq->user_count = count;
+       dreq->pos = pos;
        dreq->pages = pages;
        dreq->npages = nr_pages;
        igrab(inode);
@@ -562,7 +736,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
        nfs_begin_data_update(inode);
 
        rpc_clnt_sigmask(clnt, &oldset);
-       nfs_direct_write_schedule(dreq, user_addr, count, pos);
+       nfs_direct_write_schedule(dreq, sync);
        result = nfs_direct_wait(dreq);
        rpc_clnt_sigunmask(clnt, &oldset);
 
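diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 423f202b881c4f70b6eb7be9157115a06c0ecaf1..9f84c8a5ea4321ee0c05ec5c4af1b6611f9026e4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h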
@@ -422,6 +422,7 @@ void nfs_commit_free(struct nfs_write_data *p);
 extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
+extern void nfs_commit_release(void *wdata);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)