struct mm_struct *mm;
};
+struct userfaultfd_fork_ctx {	/* one original/new context pair built by dup_userfaultfd() */
+ struct userfaultfd_ctx *orig;	/* context the vma carried when dup_userfaultfd() ran */
+ struct userfaultfd_ctx *new;	/* context allocated to replace it on the vma */
+ struct list_head list;		/* link in the list handed to dup_userfaultfd() */
+};
+
struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_t wq;
return ret;
}
-static int __maybe_unused userfaultfd_event_wait_completion(
- struct userfaultfd_ctx *ctx,
- struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
{
int ret = 0;
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}
+/*
+ * dup_userfaultfd - give a vma the userfaultfd context it should use
+ * @vma: vma whose vm_userfaultfd_ctx is inspected and rewritten
+ * @fcs: list of userfaultfd_fork_ctx pairs collected so far
+ *
+ * If the vma has no context, or its context did not enable
+ * UFFD_FEATURE_EVENT_FORK, the vma is detached from userfaultfd and its
+ * VM_UFFD_* flags are cleared.  Otherwise the vma is pointed at a new
+ * context.  Exactly one new context is created per original context;
+ * the pair is remembered on @fcs so that later vmas sharing the same
+ * original reuse it.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+	struct userfaultfd_ctx *ctx = NULL, *octx;
+	struct userfaultfd_fork_ctx *fctx;
+
+	octx = vma->vm_userfaultfd_ctx.ctx;
+	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		return 0;
+	}
+
+	/* Reuse the context already created for this original, if any. */
+	list_for_each_entry(fctx, fcs, list) {
+		if (fctx->orig == octx) {
+			ctx = fctx->new;
+			break;
+		}
+	}
+
+	if (!ctx) {
+		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+		if (!fctx)
+			return -ENOMEM;
+
+		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+		if (!ctx) {
+			kfree(fctx);
+			return -ENOMEM;
+		}
+
+		atomic_set(&ctx->refcount, 1);
+		ctx->flags = octx->flags;
+		ctx->state = UFFD_STATE_RUNNING;
+		ctx->features = octx->features;
+		ctx->released = false;
+		ctx->mm = vma->vm_mm;
+		/*
+		 * Pin the mm_struct itself (mm_count) rather than the
+		 * address space (mm_users).
+		 * NOTE(review): this assumes the context release path
+		 * drops ctx->mm with mmdrop(); an mm_users reference
+		 * would then never be released -- confirm against
+		 * userfaultfd_ctx_put().
+		 */
+		atomic_inc(&ctx->mm->mm_count);
+
+		/* The pair keeps a reference on the original context. */
+		userfaultfd_ctx_get(octx);
+		fctx->orig = octx;
+		fctx->new = ctx;
+		list_add_tail(&fctx->list, fcs);
+	}
+
+	vma->vm_userfaultfd_ctx.ctx = ctx;
+	return 0;
+}
+
+/*
+ * Build a UFFD_EVENT_FORK message for one original/new context pair and
+ * hand it to userfaultfd_event_wait_completion() on the original
+ * context.  The new context pointer travels in the message's reserved
+ * field.  Returns whatever userfaultfd_event_wait_completion() returns.
+ */
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+	struct userfaultfd_wait_queue fork_ewq;
+
+	msg_init(&fork_ewq.msg);
+	fork_ewq.msg.event = UFFD_EVENT_FORK;
+	fork_ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+	return userfaultfd_event_wait_completion(fctx->orig, &fork_ewq);
+}
+
+/*
+ * Walk the list of context pairs built by dup_userfaultfd(), deliver a
+ * fork event for each one, and free every list entry.  After the first
+ * dup_fctx() failure no further events are delivered; the remaining
+ * entries are only unlinked and freed.
+ */
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+	struct userfaultfd_fork_ctx *cur, *next;
+	int err = 0;
+
+	list_for_each_entry_safe(cur, next, fcs, list) {
+		if (err == 0)
+			err = dup_fctx(cur);
+
+		list_del(&cur->list);
+		kfree(cur);
+	}
+}
+
static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
}
}
+static const struct file_operations userfaultfd_fops;
+
+/*
+ * Create a file descriptor for the context @new and report it in @msg.
+ *
+ * The descriptor inherits the UFFD_SHARED_FCNTL_FLAGS bits of
+ * new->flags.  On success the reserved field of @msg is zeroed, the
+ * descriptor number is stored in msg->arg.fork.ufd and 0 is returned.
+ * On failure the negative error from get_unused_fd_flags() or
+ * anon_inode_getfile() is returned and @msg is left untouched.
+ */
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+				  struct userfaultfd_ctx *new,
+				  struct uffd_msg *msg)
+{
+	unsigned int fcntl_flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+	struct file *uffd_file;
+	int new_fd;
+
+	new_fd = get_unused_fd_flags(fcntl_flags);
+	if (new_fd < 0)
+		return new_fd;
+
+	uffd_file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+				       O_RDWR | fcntl_flags);
+	if (IS_ERR(uffd_file))
+		goto err_put_fd;
+
+	fd_install(new_fd, uffd_file);
+	/* Clear the reserved field before the fd number is written. */
+	msg->arg.reserved.reserved1 = 0;
+	msg->arg.fork.ufd = new_fd;
+	return 0;
+
+err_put_fd:
+	put_unused_fd(new_fd);
+	return PTR_ERR(uffd_file);
+}
+
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
struct uffd_msg *msg)
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
struct userfaultfd_wait_queue *uwq;
+ /*
+ * Handling fork event requires sleeping operations, so
+ * we drop the event_wqh lock, then do these ops, then
+ * lock it back and wake up the waiter. While the lock is
+ * dropped the ewq may go away so we keep track of it
+ * carefully.
+ */
+ LIST_HEAD(fork_event);
+ struct userfaultfd_ctx *fork_nctx = NULL;
/* always take the fd_wqh lock before the fault_pending_wqh lock */
spin_lock(&ctx->fd_wqh.lock);
if (uwq) {
*msg = uwq->msg;
+ if (uwq->msg.event == UFFD_EVENT_FORK) {
+ fork_nctx = (struct userfaultfd_ctx *)
+ (unsigned long)
+ uwq->msg.arg.reserved.reserved1;
+ list_move(&uwq->wq.task_list, &fork_event);
+ spin_unlock(&ctx->event_wqh.lock);
+ ret = 0;
+ break;
+ }
+
userfaultfd_event_complete(ctx, uwq);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->fd_wqh.lock);
+ if (!ret && msg->event == UFFD_EVENT_FORK) {
+ ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+ if (!ret) {
+ spin_lock(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.task_list);
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+ userfaultfd_event_complete(ctx, uwq);
+ }
+ spin_unlock(&ctx->event_wqh.lock);
+ }
+ }
+
return ret;
}
* means the userland is reading).
*/
#define UFFD_API ((__u64)0xAA)
-/*
- * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- * UFFD_FEATURE_EVENT_FORK)
- */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
__u64 address;
} pagefault;
+ struct {
+ __u32 ufd;
+ } fork;
+
struct {
/* unused reserved fields */
__u64 reserved1;
* Start at 0x12 and not at 0 to be more strict against bugs.
*/
#define UFFD_EVENT_PAGEFAULT 0x12
-#if 0 /* not available yet */
#define UFFD_EVENT_FORK 0x13
-#endif
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
* are to be considered implicitly always enabled in all kernels as
* long as the uffdio_api.api requested matches UFFD_API.
*/
-#if 0 /* not available yet */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
-#endif
__u64 features;
__u64 ioctls;
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
+ LIST_HEAD(uf);
uprobe_start_dup_mmap();
if (down_write_killable(&oldmm->mmap_sem)) {
if (retval)
goto fail_nomem_policy;
tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &=
- ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
- tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;