[PATCH] Introduce sys_splice() system call
author Jens Axboe <axboe@suse.de>
Thu, 30 Mar 2006 13:15:30 +0000 (15:15 +0200)
committer Linus Torvalds <torvalds@g5.osdl.org>
Thu, 30 Mar 2006 20:28:18 +0000 (12:28 -0800)
This adds support for the sys_splice system call. Using a pipe as a
transport, it can connect to files or sockets (latter as output only).

From the splice.c comments:

   "splice": joining two ropes together by interweaving their strands.

   This is the "extended pipe" functionality, where a pipe is used as
   an arbitrary in-memory buffer. Think of a pipe as a small kernel
   buffer that you can use to transfer data from one end to the other.

   The traditional unix read/write is extended with a "splice()" operation
   that transfers data buffers to or from a pipe buffer.

   Named by Larry McVoy, original implementation from Linus, extended by
   Jens to support splicing to files and fixing the initial implementation
   bugs.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
15 files changed:
arch/i386/kernel/syscall_table.S
arch/ia64/kernel/entry.S
fs/Makefile
fs/ext2/file.c
fs/ext3/file.c
fs/pipe.c
fs/reiserfs/file.c
fs/splice.c [new file with mode: 0644]
include/asm-i386/unistd.h
include/asm-ia64/unistd.h
include/asm-powerpc/unistd.h
include/asm-x86_64/unistd.h
include/linux/fs.h
include/linux/syscalls.h
net/socket.c

index 326595f3fa4d5946d24979a72e672a5dd86f74b0..ce3ef4fa055190a8c366a64e8db8ce75fc37c9d0 100644 (file)
@@ -312,3 +312,4 @@ ENTRY(sys_call_table)
        .long sys_unshare               /* 310 */
        .long sys_set_robust_list
        .long sys_get_robust_list
+       .long sys_splice
index 0e3eda99e5494eb21187894dae2556717d37f6f0..750e8e7fbdc30a6c9f218b633a4e993c01c4038a 100644 (file)
@@ -1605,5 +1605,6 @@ sys_call_table:
        data8 sys_ni_syscall                    // reserved for pselect
        data8 sys_ni_syscall                    // 1295 reserved for ppoll
        data8 sys_unshare
+       data8 sys_splice
 
        .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
index 080b3867be4dc154aaec7f0eb898eb84ddf9032c..f3a4f7077175484429b9b856f57e1419e84ab57b 100644 (file)
@@ -10,7 +10,7 @@ obj-y :=      open.o read_write.o file_table.o buffer.o  bio.o super.o \
                ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
                attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
                seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-               ioprio.o pnode.o drop_caches.o
+               ioprio.o pnode.o drop_caches.o splice.o
 
 obj-$(CONFIG_INOTIFY)          += inotify.o
 obj-$(CONFIG_EPOLL)            += eventpoll.o
index 509cceca04dbf5f24fc9a0d14fcaca6d685a6d36..23e2c7ccec1d15794c2679c4215a917b8f0edb25 100644 (file)
@@ -53,6 +53,8 @@ const struct file_operations ext2_file_operations = {
        .readv          = generic_file_readv,
        .writev         = generic_file_writev,
        .sendfile       = generic_file_sendfile,
+       .splice_read    = generic_file_splice_read,
+       .splice_write   = generic_file_splice_write,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
index 783a796220bbe44f2fc124af0e9eaef6101e02dc..1efefb630ea971dae9728b32ac8beace5ad79187 100644 (file)
@@ -119,6 +119,8 @@ const struct file_operations ext3_file_operations = {
        .release        = ext3_release_file,
        .fsync          = ext3_sync_file,
        .sendfile       = generic_file_sendfile,
+       .splice_read    = generic_file_splice_read,
+       .splice_write   = generic_file_splice_write,
 };
 
 struct inode_operations ext3_file_inode_operations = {
index e2f4f1d9ffc2578ff4d2da0404c459b819a260a0..2414bf270db6cd68ff0cf086a448a5cf28a9530f 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -15,6 +15,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff
 {
        struct page *page = buf->page;
 
-       if (info->tmp_page) {
-               __free_page(page);
+       /*
+        * If nobody else uses this page, and we don't already have a
+        * temporary page, let's keep track of it as a one-deep
+        * allocation cache
+        */
+       if (page_count(page) == 1 && !info->tmp_page) {
+               info->tmp_page = page;
                return;
        }
-       info->tmp_page = page;
+
+       /*
+        * Otherwise just release our reference to it
+        */
+       page_cache_release(page);
 }
 
 static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
@@ -152,6 +162,11 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
                                chars = total_len;
 
                        addr = ops->map(filp, info, buf);
+                       if (IS_ERR(addr)) {
+                               if (!ret)
+                                       ret = PTR_ERR(addr);
+                               break;
+                       }
                        error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
                        ops->unmap(info, buf);
                        if (unlikely(error)) {
@@ -254,8 +269,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
                struct pipe_buf_operations *ops = buf->ops;
                int offset = buf->offset + buf->len;
                if (ops->can_merge && offset + chars <= PAGE_SIZE) {
-                       void *addr = ops->map(filp, info, buf);
-                       int error = pipe_iov_copy_from_user(offset + addr, iov, chars);
+                       void *addr;
+                       int error;
+
+                       addr = ops->map(filp, info, buf);
+                       if (IS_ERR(addr)) {
+                               error = PTR_ERR(addr);
+                               goto out;
+                       }
+                       error = pipe_iov_copy_from_user(offset + addr, iov,
+                                                       chars);
                        ops->unmap(info, buf);
                        ret = error;
                        do_wakeup = 1;
index 010094d14da678fce192938a77e3e903a245bd09..cf6e1cf4035154cbad95536dee19944eff81812f 100644 (file)
@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_file_operations = {
        .sendfile = generic_file_sendfile,
        .aio_read = generic_file_aio_read,
        .aio_write = reiserfs_aio_write,
+       .splice_read = generic_file_splice_read,
+       .splice_write = generic_file_splice_write,
 };
 
 struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644 (file)
index 0000000..efa47c1
--- /dev/null
@@ -0,0 +1,612 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files and fixing the initial implementation
+ * bugs.
+ *
+ * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
+ *
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mm_inline.h>
+
+/*
+ * Passed to the actors.
+ *
+ * Filled in by move_from_pipe(): 'len' is the number of bytes the actor
+ * is asked to consume from the current pipe buffer, 'total_len' is what
+ * is still left of the whole request. After every successful actor call
+ * 'pos' is advanced and 'total_len' reduced by 'len'.
+ */
+struct splice_desc {
+       unsigned int len, total_len;    /* current and remaining length */
+       unsigned int flags;             /* splice flags */
+       struct file *file;              /* file to read/write */
+       loff_t pos;                     /* file position */
+};
+
+/*
+ * ->release() for pipe buffers that point at page cache pages: drop the
+ * page reference held by the buffer and forget the page.
+ */
+static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+                                       struct pipe_buffer *buf)
+{
+       struct page *page = buf->page;
+
+       page_cache_release(page);
+       buf->page = NULL;
+}
+
+/*
+ * ->map() for page cache backed pipe buffers.
+ *
+ * Locks the page and returns its kmap() address; the page stays locked
+ * until ->unmap() is called. If the page is not uptodate ERR_PTR(-EIO)
+ * is returned, if it no longer belongs to a mapping ERR_PTR(-ENODATA);
+ * in both cases the page is unlocked again before returning.
+ */
+static void *page_cache_pipe_buf_map(struct file *file,
+                                    struct pipe_inode_info *info,
+                                    struct pipe_buffer *buf)
+{
+       struct page *page = buf->page;
+       int err;
+
+       lock_page(page);
+
+       if (!PageUptodate(page))
+               err = -EIO;
+       else if (!page->mapping)
+               err = -ENODATA;
+       else
+               return kmap(page);
+
+       unlock_page(page);
+       return ERR_PTR(err);
+}
+
+/*
+ * ->unmap() counterpart of page_cache_pipe_buf_map(): unlock the page
+ * and drop the kmap() taken by ->map().
+ */
+static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
+                                     struct pipe_buffer *buf)
+{
+       struct page *page = buf->page;
+
+       unlock_page(page);
+       kunmap(page);
+}
+
+/*
+ * Buffer operations installed by move_to_pipe() on every buffer that
+ * references a page cache page. can_merge is 0 so that pipe_writev(),
+ * which only appends to an existing buffer when ->can_merge is set,
+ * never copies new data into these pages.
+ */
+static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+       .can_merge = 0,
+       .map = page_cache_pipe_buf_map,
+       .unmap = page_cache_pipe_buf_unmap,
+       .release = page_cache_pipe_buf_release,
+};
+
+/*
+ * Link page cache pages into the pipe behind 'inode' without copying.
+ *
+ * inode:    pipe inode; its mutex is held while buffers are added
+ * pages:    pages to queue. The reference the caller holds on each page
+ *           is either handed over to the pipe buffer or dropped here.
+ * nr_pages: number of valid entries in 'pages'
+ * offset:   offset of the data inside the first page
+ * len:      upper bound on the number of bytes to queue
+ *
+ * Sleeps in pipe_wait() only while the pipe is full and nothing has been
+ * queued yet; once at least one page went in, a full pipe ends the call.
+ *
+ * Returns the number of bytes queued. If nothing was queued, returns
+ * -EPIPE (after sending SIGPIPE) when the pipe has no readers, or
+ * -ERESTARTSYS when a signal is pending.
+ */
+static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
+                           int nr_pages, unsigned long offset,
+                           unsigned long len)
+{
+       struct pipe_inode_info *info;
+       int ret, do_wakeup, i;
+
+       ret = 0;
+       do_wakeup = 0;
+       i = 0;
+
+       mutex_lock(PIPE_MUTEX(*inode));
+
+       info = inode->i_pipe;
+       for (;;) {
+               int bufs;
+
+               if (!PIPE_READERS(*inode)) {
+                       send_sig(SIGPIPE, current, 0);
+                       if (!ret)
+                               ret = -EPIPE;
+                       break;
+               }
+
+               bufs = info->nrbufs;
+               if (bufs < PIPE_BUFFERS) {
+                       int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
+                       struct pipe_buffer *buf = info->bufs + newbuf;
+                       struct page *page = pages[i++];
+                       unsigned long this_len;
+
+                       this_len = PAGE_CACHE_SIZE - offset;
+                       if (this_len > len)
+                               this_len = len;
+
+                       buf->page = page;
+                       buf->offset = offset;
+                       buf->len = this_len;
+                       buf->ops = &page_cache_pipe_buf_ops;
+                       info->nrbufs = ++bufs;
+                       do_wakeup = 1;
+
+                       ret += this_len;
+                       len -= this_len;
+                       offset = 0;
+
+                       /*
+                        * 'i' counts the pages handed to the pipe so far.
+                        * nr_pages must stay untouched: the cleanup loop
+                        * below compares 'i' against the original count
+                        * to find the pages that were never queued.
+                        */
+                       if (i == nr_pages)
+                               break;
+                       if (!len)
+                               break;
+                       if (bufs < PIPE_BUFFERS)
+                               continue;
+
+                       break;
+               }
+
+               if (signal_pending(current)) {
+                       if (!ret)
+                               ret = -ERESTARTSYS;
+                       break;
+               }
+
+               if (do_wakeup) {
+                       wake_up_interruptible_sync(PIPE_WAIT(*inode));
+                       kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
+                                   POLL_IN);
+                       do_wakeup = 0;
+               }
+
+               PIPE_WAITING_WRITERS(*inode)++;
+               pipe_wait(inode);
+               PIPE_WAITING_WRITERS(*inode)--;
+       }
+
+       mutex_unlock(PIPE_MUTEX(*inode));
+
+       if (do_wakeup) {
+               wake_up_interruptible(PIPE_WAIT(*inode));
+               kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+       }
+
+       /*
+        * Drop the references on every page that did not make it into
+        * the pipe.
+        */
+       while (i < nr_pages)
+               page_cache_release(pages[i++]);
+
+       return ret;
+}
+
+/*
+ * Splice one batch of at most PIPE_BUFFERS pages, starting at in->f_pos,
+ * from the page cache of 'in' into the pipe 'pipe'.
+ *
+ * Pages already in the page cache are used as they are; holes in the
+ * window are filled with find_or_create_page() and ->readpage() is
+ * started on pages that are not uptodate. Only the contiguous run of
+ * pages at the start of the window is passed to move_to_pipe(); every
+ * other page reference taken here is dropped again.
+ *
+ * Returns what move_to_pipe() returns, or 0 if not even the first page
+ * of the window could be set up. in->f_pos is not updated here.
+ */
+static int __generic_file_splice_read(struct file *in, struct inode *pipe,
+                                     size_t len)
+{
+       struct address_space *mapping = in->f_mapping;
+       unsigned int offset, nr_pages;
+       struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
+       struct page *page;
+       pgoff_t index, pidx;
+       int i, j;
+
+       index = in->f_pos >> PAGE_CACHE_SHIFT;
+       offset = in->f_pos & ~PAGE_CACHE_MASK;
+       nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+       if (nr_pages > PIPE_BUFFERS)
+               nr_pages = PIPE_BUFFERS;
+
+       /*
+        * initiate read-ahead on this page range
+        */
+       do_page_cache_readahead(mapping, in, index, nr_pages);
+
+       /*
+        * Get as many pages from the page cache as possible..
+        * Start IO on the page cache entries we create (we
+        * can assume that any pre-existing ones we find have
+        * already had IO started on them).
+        */
+       i = find_get_pages(mapping, index, nr_pages, pages);
+
+       /*
+        * common case - the pages we found are contiguous from 'index',
+        * kick them off
+        */
+       if (i && (pages[i - 1]->index == index + i - 1))
+               goto splice_them;
+
+       /*
+        * fill shadow[] with pages at the right locations, so we only
+        * have to fill holes. All nr_pages slots are inspected below, so
+        * all of them must start out empty, and a page's slot is its
+        * distance from the first index of the window.
+        */
+       memset(shadow, 0, nr_pages * sizeof(struct page *));
+       for (j = 0; j < i; j++) {
+               pgoff_t slot = pages[j]->index - index;
+
+               /*
+                * NOTE(review): find_get_pages() is not given an upper
+                * index here, so it is assumed it may hand back pages
+                * that lie behind the window - confirm. Such a page has
+                * no slot; drop it instead of writing past shadow[].
+                */
+               if (slot < nr_pages)
+                       shadow[slot] = pages[j];
+               else
+                       page_cache_release(pages[j]);
+       }
+
+       /*
+        * now fill in the holes
+        */
+       for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
+               int error;
+
+               if (shadow[i])
+                       continue;
+
+               /*
+                * no page there, look one up / create it
+                */
+               page = find_or_create_page(mapping, pidx,
+                                                  mapping_gfp_mask(mapping));
+               if (!page)
+                       break;
+
+               if (PageUptodate(page))
+                       unlock_page(page);
+               else {
+                       error = mapping->a_ops->readpage(in, page);
+
+                       if (unlikely(error)) {
+                               page_cache_release(page);
+                               break;
+                       }
+               }
+               shadow[i] = page;
+       }
+
+       /*
+        * Only shadow[0..i-1] gets spliced. If the loop above stopped
+        * early, pages found behind the failing slot still hold a
+        * reference that nobody else will drop.
+        */
+       for (j = i; j < nr_pages; j++) {
+               if (shadow[j])
+                       page_cache_release(shadow[j]);
+       }
+
+       if (!i)
+               return 0;
+
+       memcpy(pages, shadow, i * sizeof(struct page *));
+
+       /*
+        * Now we splice them into the pipe..
+        */
+splice_them:
+       return move_to_pipe(pipe, pages, i, offset, len);
+}
+
+/*
+ * Splice up to 'len' bytes from 'in' into the pipe 'pipe', one batch of
+ * pages at a time, advancing in->f_pos by the amount spliced. 'flags' is
+ * not looked at here.
+ *
+ * Returns the total number of bytes spliced; if nothing was spliced,
+ * returns the result of the first (failed or empty) batch.
+ */
+ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
+                                size_t len, unsigned int flags)
+{
+       ssize_t total = 0;
+       int chunk = 0;
+
+       for (; len; len -= chunk) {
+               chunk = __generic_file_splice_read(in, pipe, len);
+               if (chunk <= 0)
+                       break;
+
+               total += chunk;
+               in->f_pos += chunk;
+       }
+
+       return total ? total : chunk;
+}
+
+/*
+ * Actor that sends 'sd->len' bytes of the pipe buffer 'buf' to 'sd->file'
+ * using its ->sendpage() method.
+ *
+ * Returns 0 if ->sendpage() took the whole chunk, the ->map() error if
+ * the buffer could not be mapped, and -EIO otherwise.
+ */
+static int pipe_to_sendpage(struct pipe_inode_info *info,
+                           struct pipe_buffer *buf, struct splice_desc *sd)
+{
+       struct file *file = sd->file;
+       loff_t pos = sd->pos;
+       ssize_t ret;
+       void *ptr;
+
+       /*
+        * sub-optimal, but we are limited by the pipe ->map. we don't
+        * need a kmap'ed buffer here, we just want to make sure we
+        * have the page pinned if the pipe page originates from the
+        * page cache
+        */
+       ptr = buf->ops->map(file, info, buf);
+       if (IS_ERR(ptr))
+               return PTR_ERR(ptr);
+
+       /*
+        * The data sits at buf->offset inside buf->page. sd->pos is the
+        * position in the output file and says nothing about where in
+        * the page the bytes are, so it must not be used as page offset.
+        * The last argument tells ->sendpage() that more data follows.
+        */
+       ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
+                                  &pos, sd->len < sd->total_len);
+
+       buf->ops->unmap(info, buf);
+       if (ret == sd->len)
+               return 0;
+
+       return -EIO;
+}
+
+/*
+ * This is a little more tricky than the file -> pipe splicing. There are
+ * basically three cases:
+ *
+ *     - Destination page already exists in the address space and there
+ *       are users of it. For that case we have no other option than
+ *       copying the data. Tough luck.
+ *     - Destination page already exists in the address space, but there
+ *       are no users of it. Make sure it's uptodate, then drop it. Fall
+ *       through to last case.
+ *     - Destination page does not exist, we can add the pipe page to
+ *       the page cache and avoid the copy.
+ *
+ * For now we just do the slower thing and always copy pages over, it's
+ * easier than migrating pages from the pipe to the target file. For the
+ * case of doing file | file splicing, the migrate approach had some LRU
+ * nastiness...
+ *
+ * Actor contract: copies sd->len bytes from the pipe buffer 'buf' to
+ * sd->file at position sd->pos and writes the page out. A single call
+ * never crosses a page boundary of the destination file: sd->len is
+ * trimmed to what fits into the destination page, and move_from_pipe()
+ * reads sd->len back after the call to account for the bytes consumed.
+ *
+ * Returns 0 on success and a non-zero error otherwise.
+ */
+static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+                       struct splice_desc *sd)
+{
+       struct file *file = sd->file;
+       struct address_space *mapping = file->f_mapping;
+       unsigned int offset;
+       struct page *page;
+       char *src, *dst;
+       pgoff_t index;
+       int ret;
+
+       /*
+        * after this, page will be locked and unmapped
+        */
+       src = buf->ops->map(file, info, buf);
+       if (IS_ERR(src))
+               return PTR_ERR(src);
+
+       index = sd->pos >> PAGE_CACHE_SHIFT;
+       offset = sd->pos & ~PAGE_CACHE_MASK;
+
+       /*
+        * The copy below starts at 'offset' inside the destination page,
+        * don't let it run past the end of that page. The remainder of
+        * the buffer is handled by the next actor call.
+        */
+       if (sd->len > PAGE_CACHE_SIZE - offset)
+               sd->len = PAGE_CACHE_SIZE - offset;
+
+find_page:
+       ret = -ENOMEM;
+       page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+       if (!page)
+               goto out_unmap;         /* no page to unlock or release */
+
+       /*
+        * If the page is uptodate, it is also locked. If it isn't
+        * uptodate, we can mark it uptodate if we are filling the
+        * full page. Otherwise we need to read it in first...
+        */
+       if (!PageUptodate(page)) {
+               if (sd->len < PAGE_CACHE_SIZE) {
+                       ret = mapping->a_ops->readpage(file, page);
+                       /*
+                        * A failed ->readpage() is treated like in
+                        * __generic_file_splice_read(): the page is only
+                        * released, not unlocked again.
+                        */
+                       if (unlikely(ret))
+                               goto out_release;
+
+                       lock_page(page);
+
+                       if (!PageUptodate(page)) {
+                               /*
+                                * page got invalidated, repeat
+                                */
+                               if (!page->mapping) {
+                                       unlock_page(page);
+                                       page_cache_release(page);
+                                       goto find_page;
+                               }
+                               ret = -EIO;
+                               goto out;
+                       }
+               } else {
+                       WARN_ON(!PageLocked(page));
+                       SetPageUptodate(page);
+               }
+       }
+
+       /*
+        * prepare/commit exactly the byte range the memcpy() touches
+        */
+       ret = mapping->a_ops->prepare_write(file, page, offset,
+                                           offset + sd->len);
+       if (ret)
+               goto out;
+
+       dst = kmap_atomic(page, KM_USER0);
+       memcpy(dst + offset, src + buf->offset, sd->len);
+       flush_dcache_page(page);
+       kunmap_atomic(dst, KM_USER0);
+
+       ret = mapping->a_ops->commit_write(file, page, offset,
+                                          offset + sd->len);
+       if (ret < 0)
+               goto out;
+
+       set_page_dirty(page);
+       /*
+        * The success path already relies on write_one_page() unlocking
+        * the page. NOTE(review): its documentation says the page is
+        * unlocked upon return, so it is not unlocked a second time when
+        * it fails - confirm.
+        */
+       ret = write_one_page(page, 0);
+       goto out_release;
+
+out:
+       /* we still hold the page lock on the error paths that end here */
+       if (ret < 0)
+               unlock_page(page);
+out_release:
+       page_cache_release(page);
+out_unmap:
+       buf->ops->unmap(info, buf);
+       return ret;
+}
+
+/*
+ * Callback used by move_from_pipe() to consume data from one pipe
+ * buffer. It is handed the pipe, the buffer at the head of the pipe and
+ * the splice_desc describing the transfer, and returns 0 once sd->len
+ * bytes of the buffer have been dealt with. Any non-zero return stops
+ * the transfer.
+ */
+typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
+                          struct splice_desc *);
+
+/*
+ * Feed up to 'len' bytes from the pipe behind 'inode' to 'out', calling
+ * 'actor' on the buffer at the head of the pipe until the request is
+ * satisfied, the actor fails, or no more data can be expected.
+ *
+ * Returns the number of bytes consumed. If nothing was consumed, returns
+ * the actor's error (0 for -ENODATA) or -ERESTARTSYS if a signal is
+ * pending. out->f_pos is set to the final position before returning.
+ */
+static ssize_t move_from_pipe(struct inode *inode, struct file *out,
+                             size_t len, unsigned int flags,
+                             splice_actor *actor)
+{
+       struct pipe_inode_info *info;
+       int ret, do_wakeup, err;
+       struct splice_desc sd;
+
+       ret = 0;
+       do_wakeup = 0;
+
+       sd.total_len = len;
+       sd.flags = flags;
+       sd.file = out;
+       sd.pos = out->f_pos;
+
+       mutex_lock(PIPE_MUTEX(*inode));
+
+       info = inode->i_pipe;
+       for (;;) {
+               int bufs = info->nrbufs;
+
+               if (bufs) {
+                       int curbuf = info->curbuf;
+                       struct pipe_buffer *buf = info->bufs + curbuf;
+                       struct pipe_buf_operations *ops = buf->ops;
+
+                       /*
+                        * hand the actor the rest of this buffer, but no
+                        * more than what is left of the request
+                        */
+                       sd.len = buf->len;
+                       if (sd.len > sd.total_len)
+                               sd.len = sd.total_len;
+
+                       err = actor(info, buf, &sd);
+                       if (err) {
+                               /*
+                                * an error is only reported if nothing
+                                * was transferred yet; -ENODATA just ends
+                                * the transfer
+                                */
+                               if (!ret && err != -ENODATA)
+                                       ret = err;
+
+                               break;
+                       }
+
+                       ret += sd.len;
+                       buf->offset += sd.len;
+                       buf->len -= sd.len;
+                       /*
+                        * buffer fully consumed: release it and move the
+                        * head of the ring to the next slot
+                        */
+                       if (!buf->len) {
+                               buf->ops = NULL;
+                               ops->release(info, buf);
+                               curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
+                               info->curbuf = curbuf;
+                               info->nrbufs = --bufs;
+                               do_wakeup = 1;
+                       }
+
+                       sd.pos += sd.len;
+                       sd.total_len -= sd.len;
+                       if (!sd.total_len)
+                               break;
+               }
+
+               /*
+                * more buffers queued - keep going. Otherwise stop if
+                * there are no writers left, or if we already moved some
+                * data and no writer is waiting for room in the pipe.
+                */
+               if (bufs)
+                       continue;
+               if (!PIPE_WRITERS(*inode))
+                       break;
+               if (!PIPE_WAITING_WRITERS(*inode)) {
+                       if (ret)
+                               break;
+               }
+
+               if (signal_pending(current)) {
+                       if (!ret)
+                               ret = -ERESTARTSYS;
+                       break;
+               }
+
+               /*
+                * tell writers about the room we made before sleeping
+                */
+               if (do_wakeup) {
+                       wake_up_interruptible_sync(PIPE_WAIT(*inode));
+                       kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
+                       do_wakeup = 0;
+               }
+
+               pipe_wait(inode);
+       }
+
+       mutex_unlock(PIPE_MUTEX(*inode));
+
+       if (do_wakeup) {
+               wake_up_interruptible(PIPE_WAIT(*inode));
+               kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+       }
+
+       /*
+        * publish the new file position under the i_mutex of the inode
+        * backing 'out'
+        */
+       mutex_lock(&out->f_mapping->host->i_mutex);
+       out->f_pos = sd.pos;
+       mutex_unlock(&out->f_mapping->host->i_mutex);
+       return ret;
+
+}
+
+/*
+ * ->splice_write() for page cache backed files: move up to 'len' bytes
+ * from the pipe 'inode' into 'out', using pipe_to_file() as the actor.
+ * Returns whatever move_from_pipe() returns.
+ */
+ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
+                                 size_t len, unsigned int flags)
+{
+       return move_from_pipe(inode, out, len, flags, pipe_to_file);
+}
+
+/*
+ * ->splice_write() for files that implement ->sendpage(): move up to
+ * 'len' bytes from the pipe 'inode' to 'out', using pipe_to_sendpage()
+ * as the actor. Returns whatever move_from_pipe() returns.
+ */
+ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+                               size_t len, unsigned int flags)
+{
+       return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
+}
+
+/*
+ * Splice from the pipe 'pipe' to the file 'out'.
+ *
+ * Returns -EINVAL if 'out' has no ->splice_write() method, -EBADF if it
+ * is not open for writing, the rw_verify_area() error if that check
+ * fails, and the result of ->splice_write() otherwise.
+ */
+static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
+                          unsigned int flags)
+{
+       loff_t start = out->f_pos;
+       int err;
+
+       if (!(out->f_op && out->f_op->splice_write))
+               return -EINVAL;
+
+       if (!(out->f_mode & FMODE_WRITE))
+               return -EBADF;
+
+       err = rw_verify_area(WRITE, out, &start, len);
+       if (unlikely(err < 0))
+               return err;
+
+       return out->f_op->splice_write(pipe, out, len, flags);
+}
+
+/*
+ * Splice from the file 'in' to the pipe 'pipe'.
+ *
+ * Returns -EINVAL if 'in' has no ->splice_read() method, -EBADF if it is
+ * not open for reading, the rw_verify_area() error if that check fails,
+ * 0 if the file position is at or beyond the end of the file, and the
+ * result of ->splice_read() otherwise.
+ */
+static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
+                        unsigned int flags)
+{
+       loff_t start, size, remaining;
+       int err;
+
+       if (!(in->f_op && in->f_op->splice_read))
+               return -EINVAL;
+
+       if (!(in->f_mode & FMODE_READ))
+               return -EBADF;
+
+       start = in->f_pos;
+       err = rw_verify_area(READ, in, &start, len);
+       if (unlikely(err < 0))
+               return err;
+
+       size = i_size_read(in->f_mapping->host);
+       if (unlikely(in->f_pos >= size))
+               return 0;
+
+       /*
+        * never ask ->splice_read() for more than the file still holds
+        */
+       remaining = size - in->f_pos;
+       if (remaining < len)
+               len = remaining;
+
+       return in->f_op->splice_read(in, pipe, len, flags);
+}
+
+/*
+ * Work out which side of the splice is the pipe and dispatch. If 'in' is
+ * a pipe the data goes from that pipe to 'out' (this also covers the
+ * case where both ends are pipes); otherwise 'out' has to be a pipe and
+ * the data goes from 'in' into it. -EINVAL if neither end is a pipe.
+ */
+static long do_splice(struct file *in, struct file *out, size_t len,
+                     unsigned int flags)
+{
+       struct inode *in_inode = in->f_dentry->d_inode;
+       struct inode *out_inode = out->f_dentry->d_inode;
+
+       if (in_inode->i_pipe)
+               return do_splice_from(in_inode, out, len, flags);
+
+       if (out_inode->i_pipe)
+               return do_splice_to(in, out_inode, len, flags);
+
+       return -EINVAL;
+}
+
+/*
+ * splice() system call entry point.
+ *
+ * A zero 'len' returns 0 without looking at the descriptors. Otherwise
+ * -EBADF is returned unless 'fdin' refers to a file open for reading and
+ * 'fdout' to a file open for writing, in which case the result of
+ * do_splice() is returned.
+ */
+asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
+{
+       struct file *in, *out;
+       int fput_in, fput_out;
+       long error = -EBADF;
+
+       if (unlikely(!len))
+               return 0;
+
+       in = fget_light(fdin, &fput_in);
+       if (!in)
+               return error;
+
+       if (!(in->f_mode & FMODE_READ))
+               goto out_put_in;
+
+       out = fget_light(fdout, &fput_out);
+       if (!out)
+               goto out_put_in;
+
+       if (out->f_mode & FMODE_WRITE)
+               error = do_splice(in, out, len, flags);
+
+       fput_light(out, fput_out);
+out_put_in:
+       fput_light(in, fput_in);
+       return error;
+}
index 014e3562895b1312608e39fc8e3478a52d0ecd53..789e9bdd0a406188335d8da57e0625e7d84f6663 100644 (file)
 #define __NR_unshare           310
 #define __NR_set_robust_list   311
 #define __NR_get_robust_list   312
+#define __NR_splice            313
 
-#define NR_syscalls 313
+#define NR_syscalls 314
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
index 019956c613e4814d9341a912b20457b029470e0b..36070c1014d885397d92897cdfb97decf3c671a2 100644 (file)
 #define __NR_faccessat                 1293
 /* 1294, 1295 reserved for pselect/ppoll */
 #define __NR_unshare                   1296
+#define __NR_splice                    1297
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls                    273 /* length of syscall table */
+#define NR_syscalls                    274 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
index 1e990747dce7d2957a7212dbf2561cfec8d39810..536ba0873052c92be7e314446f69b1ae8b5b2e0c 100644 (file)
 #define __NR_pselect6          280
 #define __NR_ppoll             281
 #define __NR_unshare           282
+#define __NR_splice            283
 
-#define __NR_syscalls          283
+#define __NR_syscalls          284
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
index fcc5163530873bf7437d8509231db03a2165501d..f21ff2c1e960bfcafeb735b058857c14f0529a2e 100644 (file)
@@ -609,8 +609,10 @@ __SYSCALL(__NR_unshare,    sys_unshare)
 __SYSCALL(__NR_set_robust_list, sys_set_robust_list)
 #define __NR_get_robust_list   274
 __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice            275
+__SYSCALL(__NR_splice, sys_splice)
 
-#define __NR_syscall_max __NR_get_robust_list
+#define __NR_syscall_max __NR_splice
 
 #ifndef __NO_STUBS
 
index 408fe89498f421285b563e12889439b3e620b310..20fa5f6d7269f8ccfed3bafc93eda805959d7ce8 100644 (file)
@@ -1032,6 +1032,8 @@ struct file_operations {
        int (*check_flags)(int);
        int (*dir_notify)(struct file *filp, unsigned long arg);
        int (*flock) (struct file *, int, struct file_lock *);
+       ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
+       ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
 };
 
 struct inode_operations {
@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor
 extern void do_generic_mapping_read(struct address_space *mapping,
                                    struct file_ra_state *, struct file *,
                                    loff_t *, read_descriptor_t *, read_actor_t);
+extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
+extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
index e487e3b60f605add9318df0202051964468c7304..e78ffc7d5b5613d5bf6270c3942f5388c0b636bd 100644 (file)
@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
                                   int flags, int mode);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
+asmlinkage long sys_splice(int fdin, int fdout, size_t len,
+                               unsigned int flags);
 
 #endif
index fcd77eac0ccff184294eab4888e3ff61ad35d168..b13042f68c022e9ef6e8647ad35055e39b5f52ef 100644 (file)
@@ -119,6 +119,9 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector,
 static ssize_t sock_sendpage(struct file *file, struct page *page,
                             int offset, size_t size, loff_t *ppos, int more);
 
+extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+                               size_t len, unsigned int flags);
+
 
 /*
  *     Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
@@ -141,7 +144,8 @@ static struct file_operations socket_file_ops = {
        .fasync =       sock_fasync,
        .readv =        sock_readv,
        .writev =       sock_writev,
-       .sendpage =     sock_sendpage
+       .sendpage =     sock_sendpage,
+       .splice_write = generic_splice_sendpage,
 };
 
 /*