Orangefs: kernel client part 5
author Mike Marshall <hubcap@omnibond.com>
Fri, 17 Jul 2015 14:38:15 +0000 (10:38 -0400)
committer Mike Marshall <hubcap@omnibond.com>
Sat, 3 Oct 2015 15:39:57 +0000 (11:39 -0400)
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
fs/orangefs/super.c [new file with mode: 0644]
fs/orangefs/symlink.c [new file with mode: 0644]
fs/orangefs/waitqueue.c [new file with mode: 0644]
fs/orangefs/xattr.c [new file with mode: 0644]

diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644 (file)
index 0000000..a854390
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+#include <linux/parser.h>
+
+/*
+ * a cache for pvfs2-inode objects (i.e. pvfs2 inode private data);
+ * created by pvfs2_inode_cache_initialize() and destroyed by
+ * pvfs2_inode_cache_finalize() below
+ */
+static struct kmem_cache *pvfs2_inode_cache;
+
+/*
+ * list for storing pvfs2 specific superblocks in use; both the list and
+ * pvfs2_superblocks_lock are non-static and are not touched directly in
+ * this file.  NOTE(review): the lock presumably guards this list --
+ * confirm at the users (add_pvfs2_sb()/remove_pvfs2_sb()).
+ */
+LIST_HEAD(pvfs2_superblocks);
+
+DEFINE_SPINLOCK(pvfs2_superblocks_lock);
+
+enum {  /* token ids returned by match_token() in parse_mount_options() */
+       Opt_intr,               /* sets PVFS2_OPT_INTR in the private sb flags */
+       Opt_acl,                /* sets MS_POSIXACL in sb->s_flags */
+       Opt_local_lock,         /* sets PVFS2_OPT_LOCAL_LOCK in the private sb flags */
+
+       Opt_err                 /* anything else; rejected with -EINVAL */
+};
+
+static const match_table_t tokens = {  /* option string -> token id */
+       { Opt_acl,              "acl" },
+       { Opt_intr,             "intr" },
+       { Opt_local_lock,       "local_lock" },
+       { Opt_err,      NULL }  /* terminating entry with a NULL pattern */
+};
+
+
+/*
+ * Parse a comma separated mount option string and apply it to sb.
+ *
+ * The flags handled here (MS_POSIXACL in sb->s_flags, PVFS2_OPT_INTR and
+ * PVFS2_OPT_LOCAL_LOCK in the private sb flags) are cleared up front and
+ * set again for each matching option.  Empty options are skipped.
+ *
+ * Returns 0 on success, or -EINVAL at the first option that does not
+ * match the tokens table; that option is logged unless silent is set.
+ * Flags applied before the failing option stay applied.
+ */
+static int parse_mount_options(struct super_block *sb, char *options,
+               int silent)
+{
+       struct pvfs2_sb_info_s *sbi = PVFS2_SB(sb);
+       substring_t args[MAX_OPT_ARGS];
+       char *opt;
+
+       /* start from "nothing set"; options found below turn bits back on */
+       sb->s_flags &= ~MS_POSIXACL;
+       sbi->flags &= ~(PVFS2_OPT_INTR | PVFS2_OPT_LOCAL_LOCK);
+
+       for (opt = strsep(&options, ","); opt; opt = strsep(&options, ",")) {
+               if (*opt == '\0')
+                       continue;
+
+               switch (match_token(opt, tokens, args)) {
+               case Opt_acl:
+                       sb->s_flags |= MS_POSIXACL;
+                       break;
+               case Opt_intr:
+                       sbi->flags |= PVFS2_OPT_INTR;
+                       break;
+               case Opt_local_lock:
+                       sbi->flags |= PVFS2_OPT_LOCAL_LOCK;
+                       break;
+               default:
+                       if (!silent)
+                               gossip_err("Error: mount option [%s] is not supported.\n", opt);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Constructor handed to kmem_cache_create() for pvfs2_inode_cache:
+ * performs the one-time setup of the embedded VFS inode and of the
+ * xattr rw-semaphore, and starts i_version at 1.
+ */
+static void pvfs2_inode_cache_ctor(void *req)
+{
+       struct pvfs2_inode_s *pi = req;
+       struct inode *vfs = &pi->vfs_inode;
+
+       inode_init_once(vfs);
+       init_rwsem(&pi->xattr_sem);
+
+       vfs->i_version = 1;
+}
+
+/*
+ * alloc_inode super operation: take a pvfs2_inode_s from the slab cache,
+ * reset its pvfs2 private fields and hand back the embedded VFS inode.
+ * Returns NULL if the cache allocation fails.
+ */
+static struct inode *pvfs2_alloc_inode(struct super_block *sb)
+{
+       struct pvfs2_inode_s *pi =
+               kmem_cache_alloc(pvfs2_inode_cache, PVFS2_CACHE_ALLOC_FLAGS);
+
+       if (!pi) {
+               gossip_err("Failed to allocate pvfs2_inode\n");
+               return NULL;
+       }
+
+       /*
+        * Reset the private fields only; the rw_semaphore and the
+        * vfs_inode are deliberately left untouched.
+        */
+       pi->pinode_flags = 0;
+       pi->last_failed_block_index_read = 0;
+       pi->refn.fs_id = PVFS_FS_ID_NULL;
+       memset(&pi->refn.khandle, 0, 16);
+       memset(pi->link_target, 0, sizeof(pi->link_target));
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_alloc_inode: allocated %p\n",
+                    &pi->vfs_inode);
+
+       return &pi->vfs_inode;
+}
+
+/*
+ * destroy_inode super operation: return the pvfs2_inode_s that embeds
+ * inode to the slab cache it was allocated from.
+ */
+static void pvfs2_destroy_inode(struct inode *inode)
+{
+       struct pvfs2_inode_s *pi;
+
+       pi = PVFS2_I(inode);
+
+       /* log before the object goes back to the cache */
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "%s: deallocated %p destroying inode %pU\n",
+                    __func__, pi, get_khandle_from_ino(inode));
+
+       kmem_cache_free(pvfs2_inode_cache, pi);
+}
+
+/*
+ * statfs super operation: sends a PVFS2_VFS_OP_STATFS upcall for the
+ * file system behind dentry and copies the answer into buf.
+ *
+ * buf is only filled in when the downcall status is not negative.
+ * Returns -ENOMEM if no op could be allocated, otherwise the value
+ * returned by service_operation().
+ *
+ * NOTE: information filled in here is typically reflected in the
+ * output of the system command 'df'
+ */
+static int pvfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct super_block *sb = dentry->d_sb;
+       struct pvfs2_kernel_op_s *new_op;
+       int op_flags;
+       int ret;
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_statfs: called on sb %p (fs_id is %d)\n",
+                    sb,
+                    (int)(PVFS2_SB(sb)->fs_id));
+
+       new_op = op_alloc(PVFS2_VFS_OP_STATFS);
+       if (new_op == NULL)
+               return -ENOMEM;
+       new_op->upcall.req.statfs.fs_id = PVFS2_SB(sb)->fs_id;
+
+       /* the "intr" mount option makes the wait interruptible */
+       op_flags = (PVFS2_SB(sb)->flags & PVFS2_OPT_INTR) ?
+                       PVFS2_OP_INTERRUPTIBLE : 0;
+
+       ret = service_operation(new_op, "pvfs2_statfs", op_flags);
+
+       if (new_op->downcall.status >= 0) {
+               gossip_debug(GOSSIP_SUPER_DEBUG,
+                            "pvfs2_statfs: got %ld blocks available | "
+                            "%ld blocks total | %ld block size\n",
+                            (long)new_op->downcall.resp.statfs.blocks_avail,
+                            (long)new_op->downcall.resp.statfs.blocks_total,
+                            (long)new_op->downcall.resp.statfs.block_size);
+
+               /* identity and geometry */
+               buf->f_type = sb->s_magic;
+               memcpy(&buf->f_fsid, &PVFS2_SB(sb)->fs_id,
+                      sizeof(buf->f_fsid));
+               buf->f_bsize = new_op->downcall.resp.statfs.block_size;
+               buf->f_namelen = PVFS2_NAME_LEN;
+
+               /* block and file counts; bfree and bavail share one source */
+               buf->f_blocks =
+                       (sector_t) new_op->downcall.resp.statfs.blocks_total;
+               buf->f_bfree =
+                       (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+               buf->f_bavail =
+                       (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+               buf->f_files =
+                       (sector_t) new_op->downcall.resp.statfs.files_total;
+               buf->f_ffree =
+                       (sector_t) new_op->downcall.resp.statfs.files_avail;
+               buf->f_frsize = sb->s_blocksize;
+       }
+
+       op_release(new_op);
+       gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_statfs: returning %d\n", ret);
+       return ret;
+}
+
+/*
+ * remount_fs super operation (remount initiated by the VFS layer).
+ * Only the mount options are re-parsed; pvfs2-client-core does not need
+ * to be told.  The options are parsed with silent set, so an unsupported
+ * option yields -EINVAL without an error message.
+ */
+static int pvfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+       int ret;
+
+       gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount_fs: called\n");
+
+       ret = parse_mount_options(sb, data, 1);
+       return ret;
+}
+
+/*
+ * Remount as initiated by pvfs2-client-core on restart: re-sends the
+ * mount information of an existing superblock to a freshly started
+ * userspace client.
+ *
+ * This is not the same as creating a superblock for the first time.  The
+ * request is queued as a priority operation so that it gets ahead of any
+ * other operation waiting to be serviced; otherwise the client could
+ * fail repeatedly on those pending operations before it has regained the
+ * mount information from us.
+ *
+ * Returns -ENOMEM if no op could be allocated, otherwise the result of
+ * service_operation().  On success the id from the downcall is stored in
+ * the private sb info and mount_pending is cleared.
+ *
+ * NOTE: this function assumes that the request_mutex is already acquired!
+ */
+int pvfs2_remount(struct super_block *sb)
+{
+       struct pvfs2_kernel_op_s *new_op;
+       int ret;
+
+       gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_remount: called\n");
+
+       new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+       if (new_op == NULL)
+               return -ENOMEM;
+
+       strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+               PVFS2_SB(sb)->devname,
+               PVFS_MAX_SERVER_ADDR_LEN);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "Attempting PVFS2 Remount via host %s\n",
+                    new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+       /*
+        * The caller holds request_mutex (so nothing can slip in ahead of
+        * this op), hence PVFS2_OP_NO_SEMAPHORE.
+        */
+       ret = service_operation(new_op, "pvfs2_remount",
+                               PVFS2_OP_PRIORITY | PVFS2_OP_NO_SEMAPHORE);
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_remount: mount got return value of %d\n",
+                    ret);
+       if (ret != 0)
+               goto out;
+
+       /*
+        * Remember the id assigned to this sb -- a short-lived mapping
+        * the system interface uses to find the matching mount entry.
+        */
+       PVFS2_SB(sb)->id = new_op->downcall.resp.fs_mount.id;
+       PVFS2_SB(sb)->mount_pending = 0;
+
+out:
+       op_release(new_op);
+       return ret;
+}
+
+int fsid_key_table_initialize(void)
+{
+       return 0;       /* stub: nothing is set up here, always reports success */
+}
+
+void fsid_key_table_finalize(void)
+{       /* stub: counterpart of fsid_key_table_initialize(), nothing to undo */
+}
+
+/*
+ * dirty_inode super operation.  Called whenever the VFS dirties the
+ * inode in response to atime updates; records that fact by setting the
+ * atime flag on the pvfs2 inode.  The flags argument is not used.
+ */
+static void pvfs2_dirty_inode(struct inode *inode, int flags)
+{
+       struct pvfs2_inode_s *pi;
+
+       pi = PVFS2_I(inode);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_dirty_inode: %pU\n",
+                    get_khandle_from_ino(inode));
+
+       SetAtimeFlag(pi);
+}
+
+/* Super block operations installed on sb->s_op by pvfs2_fill_sb(). */
+struct super_operations pvfs2_s_ops = {
+       /* inode life cycle */
+       .alloc_inode    = pvfs2_alloc_inode,
+       .destroy_inode  = pvfs2_destroy_inode,
+       .drop_inode     = generic_delete_inode,
+       .dirty_inode    = pvfs2_dirty_inode,
+
+       /* file system wide operations */
+       .statfs         = pvfs2_statfs,
+       .remount_fs     = pvfs2_remount_fs,
+       .show_options   = generic_show_options,
+};
+
+/*
+ * fh_to_dentry export operation: rebuild an object reference from a file
+ * handle laid out as in pvfs2_encode_fh() (16 bytes of khandle followed
+ * by the fs_id in raw word 4) and return a dentry for it.
+ *
+ * Returns NULL when the handle is shorter than 5 words or its type is
+ * greater than 2; otherwise whatever d_obtain_alias() returns for the
+ * inode looked up by pvfs2_iget().
+ */
+struct dentry *pvfs2_fh_to_dentry(struct super_block *sb,
+                                 struct fid *fid,
+                                 int fh_len,
+                                 int fh_type)
+{
+       struct pvfs2_object_kref refn;
+       struct inode *inode;
+
+       if (!(fh_len >= 5 && fh_type <= 2))
+               return NULL;
+
+       PVFS_khandle_from(&(refn.khandle), fid->raw, 16);
+       refn.fs_id = (u32) fid->raw[4];
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "fh_to_dentry: handle %pU, fs_id %d\n",
+                    &refn.khandle,
+                    refn.fs_id);
+
+       inode = pvfs2_iget(sb, &refn);
+       return d_obtain_alias(inode);
+}
+
+/*
+ * encode_fh export operation.
+ *
+ * Layout written into fh (32-bit words):
+ *   words 0-3  khandle of inode (16 bytes)
+ *   word  4    fs_id of inode
+ * and, only when parent is given:
+ *   bytes 20-35 (words 5-8)  khandle of parent
+ *   word  9                  fs_id of parent
+ *
+ * *max_len is the buffer size in words on entry and is set to the number
+ * of words needed (5, or 10 with a parent) on return.
+ *
+ * Returns 1 for an inode-only handle, 2 when the parent was encoded as
+ * well, and 255 when the buffer is too small (nothing is written then).
+ */
+int pvfs2_encode_fh(struct inode *inode,
+                   __u32 *fh,
+                   int *max_len,
+                   struct inode *parent)
+{
+       struct pvfs2_object_kref refn;
+       int needed;
+       int type;
+
+       needed = parent ? 10 : 5;
+       if (*max_len < needed) {
+               gossip_lerr("fh buffer is too small for encoding\n");
+               *max_len = needed;
+               return 255;
+       }
+
+       /* the object itself */
+       refn = PVFS2_I(inode)->refn;
+       PVFS_khandle_to(&refn.khandle, fh, 16);
+       fh[4] = refn.fs_id;
+       type = 1;
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "Encoding fh: handle %pU, fsid %u\n",
+                    &refn.khandle,
+                    refn.fs_id);
+
+       /* optionally followed by its parent */
+       if (parent) {
+               refn = PVFS2_I(parent)->refn;
+               PVFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
+               fh[9] = refn.fs_id;
+               type = 2;
+
+               gossip_debug(GOSSIP_SUPER_DEBUG,
+                            "Encoding parent: handle %pU, fsid %u\n",
+                            &refn.khandle,
+                            refn.fs_id);
+       }
+
+       *max_len = needed;
+       return type;
+}
+
+/* Export operations installed on sb->s_export_op by pvfs2_fill_sb(). */
+static struct export_operations pvfs2_export_ops = {
+       .fh_to_dentry   = pvfs2_fh_to_dentry,
+       .encode_fh      = pvfs2_encode_fh,
+};
+
+/*
+ * fill_super callback handed to mount_nodev() by pvfs2_mount().
+ *
+ * sb:     superblock to populate
+ * data:   struct pvfs2_mount_sb_info_s prepared by pvfs2_mount(); carries
+ *         the root khandle, fs_id and id from the mount downcall plus the
+ *         raw mount option string
+ * silent: passed through to parse_mount_options()
+ *
+ * Allocates the private sb info, parses the mount options, installs the
+ * xattr/super/dentry/export operations and sets up the root inode and
+ * root dentry.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the error from
+ * parse_mount_options(), or the error encoded in the pointer returned by
+ * pvfs2_iget().  On failure sb->s_fs_info is left allocated; it is freed
+ * by pvfs2_kill_sb().  NOTE(review): this relies on the VFS calling
+ * ->kill_sb after a failed fill_super -- confirm.
+ */
+int pvfs2_fill_sb(struct super_block *sb, void *data, int silent)
+{
+       int ret;
+       struct inode *root;
+       struct dentry *root_dentry;
+       struct pvfs2_mount_sb_info_s *mount_sb_info = data;
+       struct pvfs2_object_kref root_object;
+
+       /* alloc and zero our private pvfs2 sb info */
+       sb->s_fs_info =
+               kzalloc(sizeof(struct pvfs2_sb_info_s), PVFS2_GFP_FLAGS);
+       if (!PVFS2_SB(sb))
+               return -ENOMEM;
+       PVFS2_SB(sb)->sb = sb;
+
+       PVFS2_SB(sb)->root_khandle = mount_sb_info->root_khandle;
+       PVFS2_SB(sb)->fs_id = mount_sb_info->fs_id;
+       PVFS2_SB(sb)->id = mount_sb_info->id;
+
+       if (mount_sb_info->data) {
+               ret = parse_mount_options(sb, mount_sb_info->data,
+                                         silent);
+               if (ret)
+                       return ret;
+       }
+
+       /* Hang the xattr handlers off the superblock */
+       sb->s_xattr = pvfs2_xattr_handlers;
+       sb->s_magic = PVFS2_SUPER_MAGIC;
+       sb->s_op = &pvfs2_s_ops;
+       sb->s_d_op = &pvfs2_dentry_operations;
+
+       sb->s_blocksize = pvfs_bufmap_size_query();
+       sb->s_blocksize_bits = pvfs_bufmap_shift_query();
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+       root_object.khandle = PVFS2_SB(sb)->root_khandle;
+       root_object.fs_id = PVFS2_SB(sb)->fs_id;
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "get inode %pU, fsid %d\n",
+                    &root_object.khandle,
+                    root_object.fs_id);
+
+       root = pvfs2_iget(sb, &root_object);
+       if (IS_ERR(root))
+               return PTR_ERR(root);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "Allocated root inode [%p] with mode %x\n",
+                    root,
+                    root->i_mode);
+
+       /*
+        * allocates and places root dentry in dcache.  d_make_root()
+        * drops the inode reference itself when it fails, so there must
+        * be no iput(root) here -- that would be a double put.
+        */
+       root_dentry = d_make_root(root);
+       if (!root_dentry)
+               return -ENOMEM;
+
+       sb->s_export_op = &pvfs2_export_ops;
+       sb->s_root = root_dentry;
+       return 0;
+}
+
+/*
+ * mount entry point for the file system type.
+ *
+ * fst:     file system type, passed on to mount_nodev()
+ * flags:   mount flags, passed on to mount_nodev()
+ * devname: server address; copied into the FS_MOUNT upcall and, on
+ *          success, into the private sb info
+ * data:    raw mount option string, parsed later by pvfs2_fill_sb()
+ *
+ * Sends a PVFS2_VFS_OP_FS_MOUNT upcall, then creates the superblock via
+ * mount_nodev() using the root khandle, fs_id and id from the downcall.
+ *
+ * Returns the root dentry on success.  On failure returns an ERR_PTR:
+ * -EINVAL for a missing devname, -ENOMEM if no op could be allocated,
+ * the mount_nodev() error if superblock creation failed, and -EINVAL for
+ * any other failure.
+ * NOTE(review): a service_operation() failure is reported to the caller
+ * as -EINVAL whatever ret is; only the log carries the real code.
+ */
+struct dentry *pvfs2_mount(struct file_system_type *fst,
+                          int flags,
+                          const char *devname,
+                          void *data)
+{
+       int ret;
+       struct super_block *sb;
+       struct pvfs2_kernel_op_s *new_op;
+       struct pvfs2_mount_sb_info_s mount_sb_info;
+       struct dentry *mnt_sb_d = ERR_PTR(-EINVAL);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_mount: called with devname %s\n",
+                    devname);
+
+       if (!devname) {
+               gossip_err("ERROR: device name not specified.\n");
+               return ERR_PTR(-EINVAL);
+       }
+
+       new_op = op_alloc(PVFS2_VFS_OP_FS_MOUNT);
+       if (!new_op)
+               return ERR_PTR(-ENOMEM);
+
+       strncpy(new_op->upcall.req.fs_mount.pvfs2_config_server,
+               devname,
+               PVFS_MAX_SERVER_ADDR_LEN);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "Attempting PVFS2 Mount via host %s\n",
+                    new_op->upcall.req.fs_mount.pvfs2_config_server);
+
+       ret = service_operation(new_op, "pvfs2_mount", 0);
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_mount: mount got return value of %d\n", ret);
+       if (ret)
+               goto free_op;
+
+       if (new_op->downcall.resp.fs_mount.fs_id == PVFS_FS_ID_NULL) {
+               gossip_err("ERROR: Retrieved null fs_id\n");
+               ret = -EINVAL;
+               goto free_op;
+       }
+
+       /* fill in temporary structure passed to fill_sb method */
+       mount_sb_info.data = data;
+       mount_sb_info.root_khandle =
+               new_op->downcall.resp.fs_mount.root_khandle;
+       mount_sb_info.fs_id = new_op->downcall.resp.fs_mount.fs_id;
+       mount_sb_info.id = new_op->downcall.resp.fs_mount.id;
+
+       /*
+        * the mount_sb_info structure looks odd, but it's used because
+        * the private sb info isn't allocated until we call
+        * pvfs2_fill_sb, yet we have the info we need to fill it with
+        * here.  so we store it temporarily and pass all of the info
+        * to fill_sb where it's properly copied out
+        */
+       mnt_sb_d = mount_nodev(fst,
+                              flags,
+                              (void *)&mount_sb_info,
+                              pvfs2_fill_sb);
+       if (IS_ERR(mnt_sb_d)) {
+               /*
+                * ret is still 0 from the successful upcall; pick up the
+                * real error so the failure message below does not claim
+                * "failed with 0"
+                */
+               ret = PTR_ERR(mnt_sb_d);
+               goto free_op;
+       }
+
+       sb = mnt_sb_d->d_sb;
+
+       /*
+        * on successful mount, store the devname and data
+        * used
+        */
+       strncpy(PVFS2_SB(sb)->devname,
+               devname,
+               PVFS_MAX_SERVER_ADDR_LEN);
+
+       /* mount_pending must be cleared */
+       PVFS2_SB(sb)->mount_pending = 0;
+
+       /*
+        * finally, add this sb to our list of known pvfs2
+        * sb's
+        */
+       add_pvfs2_sb(sb);
+       op_release(new_op);
+       return mnt_sb_d;
+
+free_op:
+       gossip_err("pvfs2_mount: mount request failed with %d\n", ret);
+       if (ret == -EINVAL) {
+               gossip_err("Ensure that all pvfs2-servers have the same FS configuration files\n");
+               gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
+       }
+
+       op_release(new_op);
+
+       gossip_debug(GOSSIP_SUPER_DEBUG,
+                    "pvfs2_mount: returning dentry %p\n",
+                    mnt_sb_d);
+       return mnt_sb_d;
+}
+
+void pvfs2_kill_sb(struct super_block *sb)
+{
+       gossip_debug(GOSSIP_SUPER_DEBUG, "pvfs2_kill_sb: called\n");
+
+       /*
+        * issue the unmount to userspace to tell it to remove the
+        * dynamic mount info it has for this superblock
+        */
+       pvfs2_unmount_sb(sb);
+
+       /*
+        * remove the sb from our list of pvfs2 specific sb's
+        * (pvfs2_mount() puts it there with add_pvfs2_sb() on success)
+        */
+       remove_pvfs2_sb(sb);
+
+       /* provided sb cleanup */
+       kill_anon_super(sb);
+
+       /*
+        * free the pvfs2 superblock private data (allocated in
+        * pvfs2_fill_sb()); note that PVFS2_SB(sb) is evaluated here,
+        * after kill_anon_super() has run
+        */
+       kfree(PVFS2_SB(sb));
+}
+
+/*
+ * Create the slab cache that backs pvfs2_alloc_inode(); objects are set
+ * up by pvfs2_inode_cache_ctor().
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int pvfs2_inode_cache_initialize(void)
+{
+       struct kmem_cache *cache;
+
+       cache = kmem_cache_create("pvfs2_inode_cache",
+                                 sizeof(struct pvfs2_inode_s),
+                                 0,
+                                 PVFS2_CACHE_CREATE_FLAGS,
+                                 pvfs2_inode_cache_ctor);
+       pvfs2_inode_cache = cache;
+       if (cache)
+               return 0;
+
+       gossip_err("Cannot create pvfs2_inode_cache\n");
+       return -ENOMEM;
+}
+
+int pvfs2_inode_cache_finalize(void)
+{
+       kmem_cache_destroy(pvfs2_inode_cache);  /* cache made by pvfs2_inode_cache_initialize() */
+       return 0;       /* always reports success */
+}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644 (file)
index 0000000..2adfcef
--- /dev/null
@@ -0,0 +1,31 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * follow_link inode operation: the link target is kept in the pvfs2
+ * inode's link_target buffer, so simply hand that buffer back, both as
+ * the return value and through *cookie.
+ */
+static const char *pvfs2_follow_link(struct dentry *dentry, void **cookie)
+{
+       struct pvfs2_inode_s *pi = PVFS2_I(dentry->d_inode);
+       char *target = pi->link_target;
+
+       gossip_debug(GOSSIP_INODE_DEBUG,
+                    "%s: called on %s (target is %p)\n",
+                    __func__, (char *)dentry->d_name.name, target);
+
+       *cookie = target;
+       return target;
+}
+
+/* Inode operations for pvfs2 symbolic links. */
+struct inode_operations pvfs2_symlink_inode_operations = {
+       /* link resolution */
+       .follow_link    = pvfs2_follow_link,
+       .readlink       = generic_readlink,
+
+       /* attributes */
+       .getattr        = pvfs2_getattr,
+       .setattr        = pvfs2_setattr,
+
+       /* extended attributes */
+       .listxattr      = pvfs2_listxattr,
+       .setxattr       = generic_setxattr,
+};
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
new file mode 100644 (file)
index 0000000..9b32286
--- /dev/null
@@ -0,0 +1,522 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ * (C) 2011 Omnibond Systems
+ *
+ * Changes by Acxiom Corporation to implement generic service_operation()
+ * function, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  In-kernel waitqueue operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Walk every operation currently on the request queue, mark it as purged
+ * (under the op's own lock) and wake up whoever sleeps on its waitq.
+ * The whole walk runs under pvfs2_request_list_lock.
+ *
+ * NOTE: This is called from the device close after client-core has
+ * guaranteed that no new operations could appear on the list since the
+ * client-core is anyway going to exit.
+ */
+void purge_waiting_ops(void)
+{
+       struct pvfs2_kernel_op_s *pending;
+
+       spin_lock(&pvfs2_request_list_lock);
+
+       list_for_each_entry(pending, &pvfs2_request_list, list) {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "pvfs2-client-core: purging op tag %llu %s\n",
+                            llu(pending->tag),
+                            get_opname_string(pending));
+
+               /* state changes are made under the op's own lock */
+               spin_lock(&pending->lock);
+               set_op_state_purged(pending);
+               spin_unlock(&pending->lock);
+
+               wake_up_interruptible(&pending->waitq);
+       }
+
+       spin_unlock(&pvfs2_request_list_lock);
+}
+
+/*
+ * submits a PVFS2 operation and waits for it to complete
+ *
+ * op:      the operation to queue; its upcall tgid/pid are set from
+ *          current before it is queued
+ * op_name: label used in the log and debug messages only
+ * flags:   bitmask; the bits tested in this function are
+ *   PVFS2_OP_INTERRUPTIBLE - when clear, mask_blocked_signals() is
+ *                            called before queueing and
+ *                            unmask_blocked_signals() after the wait
+ *   PVFS2_OP_NO_SEMAPHORE  - do not take request_mutex around queueing
+ *   PVFS2_OP_PRIORITY      - queue with add_priority_op_to_request_list()
+ *                            instead of add_op_to_request_list()
+ *   PVFS2_OP_ASYNC         - return 0 right after queueing, no waiting
+ *   PVFS2_OP_CANCELLATION  - wait with wait_for_cancellation_downcall()
+ *                            instead of wait_for_matching_downcall()
+ *
+ * Note op->downcall.status will contain the status of the operation (in
+ * errno format), whether provided by pvfs2-client or a result of failure to
+ * service the operation.  If the caller wishes to distinguish, then
+ * op->state can be checked to see if it was serviced or not.
+ *
+ * An op that was not serviced and ended with -EAGAIN is queued again
+ * from the top unless it uses shared memory.  For a shared memory op,
+ * if the bufmap is not initialized, the task sleeps on
+ * pvfs2_bufmap_init_waitq for up to PVFS2_BUFMAP_WAIT_TIMEOUT_SECS and
+ * then returns -EIO (bufmap still not initialized) or -EAGAIN (caller
+ * must re-populate its shared memory buffer).
+ *
+ * Returns contents of op->downcall.status for convenience.  Exceptions
+ * visible below: the PVFS2_OP_ASYNC path returns 0, and the shared
+ * memory wait path can return -EIO while op->downcall.status is still
+ * -EAGAIN.
+ */
+int service_operation(struct pvfs2_kernel_op_s *op,
+                     const char *op_name,
+                     int flags)
+{
+       /*
+        * signal set handed to mask_blocked_signals() and later to
+        * unmask_blocked_signals(); only used when the op is not
+        * interruptible
+        */
+       sigset_t orig_sigset;
+       int ret = 0;
+
+       /* irqflags and wait_entry are only used IF the client-core aborts */
+       unsigned long irqflags;
+
+       DECLARE_WAITQUEUE(wait_entry, current);
+
+       op->upcall.tgid = current->tgid;
+       op->upcall.pid = current->pid;
+
+retry_servicing:
+       op->downcall.status = 0;
+       gossip_debug(GOSSIP_WAIT_DEBUG,
+                    "pvfs2: service_operation: %s %p\n",
+                    op_name,
+                    op);
+       gossip_debug(GOSSIP_WAIT_DEBUG,
+                    "pvfs2: operation posted by process: %s, pid: %i\n",
+                    current->comm,
+                    current->pid);
+
+       /* mask out signals if this operation is not to be interrupted */
+       if (!(flags & PVFS2_OP_INTERRUPTIBLE))
+               mask_blocked_signals(&orig_sigset);
+
+       if (!(flags & PVFS2_OP_NO_SEMAPHORE)) {
+               ret = mutex_lock_interruptible(&request_mutex);
+               /*
+                * check to see if we were interrupted while waiting for
+                * semaphore
+                */
+               if (ret < 0) {
+                       if (!(flags & PVFS2_OP_INTERRUPTIBLE))
+                               unmask_blocked_signals(&orig_sigset);
+                       op->downcall.status = ret;
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "pvfs2: service_operation interrupted.\n");
+                       return ret;
+               }
+       }
+
+       gossip_debug(GOSSIP_WAIT_DEBUG,
+                    "%s:About to call is_daemon_in_service().\n",
+                    __func__);
+
+       if (is_daemon_in_service() < 0) {
+               /*
+                * By incrementing the per-operation attempt counter, we
+                * directly go into the timeout logic while waiting for
+                * the matching downcall to be read
+                */
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:client core is NOT in service(%d).\n",
+                            __func__,
+                            is_daemon_in_service());
+               op->attempts++;
+       }
+
+       /* queue up the operation */
+       if (flags & PVFS2_OP_PRIORITY) {
+               add_priority_op_to_request_list(op);
+       } else {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:About to call add_op_to_request_list().\n",
+                            __func__);
+               add_op_to_request_list(op);
+       }
+
+       if (!(flags & PVFS2_OP_NO_SEMAPHORE))
+               mutex_unlock(&request_mutex);
+
+       /*
+        * If we are asked to service an asynchronous operation from
+        * VFS perspective, we are done.
+        * NOTE(review): this return is reached before the
+        * unmask_blocked_signals() call further down, so a
+        * non-interruptible async op leaves here with signals still
+        * masked -- confirm that this is intended.
+        */
+       if (flags & PVFS2_OP_ASYNC)
+               return 0;
+
+       if (flags & PVFS2_OP_CANCELLATION) {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:"
+                            "About to call wait_for_cancellation_downcall.\n",
+                            __func__);
+               ret = wait_for_cancellation_downcall(op);
+       } else {
+               ret = wait_for_matching_downcall(op);
+       }
+
+       if (ret < 0) {
+               /* failed to get matching downcall */
+               if (ret == -ETIMEDOUT) {
+                       gossip_err("pvfs2: %s -- wait timed out; aborting attempt.\n",
+                                  op_name);
+               }
+               op->downcall.status = ret;
+       } else {
+               /* got matching downcall; make sure status is in errno format */
+               op->downcall.status =
+                   pvfs2_normalize_to_errno(op->downcall.status);
+               ret = op->downcall.status;
+       }
+
+       if (!(flags & PVFS2_OP_INTERRUPTIBLE))
+               unmask_blocked_signals(&orig_sigset);
+
+       BUG_ON(ret != op->downcall.status);
+       /* retry if operation has not been serviced and if requested */
+       if (!op_state_serviced(op) && op->downcall.status == -EAGAIN) {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "pvfs2: tag %llu (%s)"
+                            " -- operation to be retried (%d attempt)\n",
+                            llu(op->tag),
+                            op_name,
+                            op->attempts + 1);
+
+               if (!op->uses_shared_memory)
+                       /*
+                        * this operation doesn't use the shared memory
+                        * system
+                        */
+                       goto retry_servicing;
+
+               /* op uses shared memory */
+               if (get_bufmap_init() == 0) {
+                       /*
+                        * This operation uses the shared memory system AND
+                        * the system is not yet ready. This situation occurs
+                        * when the client-core is restarted AND there were
+                        * operations waiting to be processed or were already
+                        * in process.
+                        */
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "uses_shared_memory is true.\n");
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "Client core in-service status(%d).\n",
+                                    is_daemon_in_service());
+                       gossip_debug(GOSSIP_WAIT_DEBUG, "bufmap_init:%d.\n",
+                                    get_bufmap_init());
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "operation's status is 0x%0x.\n",
+                                    op->op_state);
+
+                       /*
+                        * let process sleep for a few seconds so shared
+                        * memory system can be initialized.
+                        */
+                       spin_lock_irqsave(&op->lock, irqflags);
+                       add_wait_queue(&pvfs2_bufmap_init_waitq, &wait_entry);
+                       spin_unlock_irqrestore(&op->lock, irqflags);
+
+                       set_current_state(TASK_INTERRUPTIBLE);
+
+                       /*
+                        * Wait for pvfs_bufmap_initialize() to wake me up
+                        * within the allotted time.
+                        */
+                       ret = schedule_timeout(MSECS_TO_JIFFIES
+                               (1000 * PVFS2_BUFMAP_WAIT_TIMEOUT_SECS));
+
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "Value returned from schedule_timeout:"
+                                    "%d.\n",
+                                    ret);
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "Is shared memory available? (%d).\n",
+                                    get_bufmap_init());
+
+                       spin_lock_irqsave(&op->lock, irqflags);
+                       remove_wait_queue(&pvfs2_bufmap_init_waitq,
+                                         &wait_entry);
+                       spin_unlock_irqrestore(&op->lock, irqflags);
+
+                       if (get_bufmap_init() == 0) {
+                               gossip_err("%s:The shared memory system has not started in %d seconds after the client core restarted.  Aborting user's request(%s).\n",
+                                          __func__,
+                                          PVFS2_BUFMAP_WAIT_TIMEOUT_SECS,
+                                          get_opname_string(op));
+                               return -EIO;
+                       }
+
+                       /*
+                        * Return to the calling function and re-populate a
+                        * shared memory buffer.
+                        */
+                       return -EAGAIN;
+               }
+       }
+
+       gossip_debug(GOSSIP_WAIT_DEBUG,
+                    "pvfs2: service_operation %s returning: %d for %p.\n",
+                    op_name,
+                    ret,
+                    op);
+       return ret;
+}
+
+/*
+ * Take an interrupted operation off whatever list it currently sits on.
+ *
+ * op: the interrupted operation; NULL is tolerated and ignored, as is an
+ *     op whose state is none of waiting / in progress / serviced /
+ *     purged.
+ *
+ * The op state is examined under op->lock, and the lock is released on
+ * every path before this function returns:
+ *   waiting     - the upcall was never read; the op is removed from the
+ *                 request list
+ *   in progress - the op is removed from the in-progress hash table
+ *   serviced    - there is nothing to dequeue
+ *   otherwise   - the state is reported with gossip_err()
+ */
+void pvfs2_clean_up_interrupted_operation(struct pvfs2_kernel_op_s *op)
+{
+       /*
+        * handle interrupted cases depending on what state we were in when
+        * the interruption is detected.  there is a coarse grained lock
+        * across the operation.
+        *
+        * NOTE: be sure not to reverse lock ordering by locking an op lock
+        * while holding the request_list lock.  Here, the op lock is
+        * always dropped before a list lock is taken.
+        */
+       if (!op) {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                           "%s: op is null, ignoring\n",
+                            __func__);
+               return;
+       }
+
+       /*
+        * one more sanity check, make sure it's in one of the possible states
+        * or don't try to cancel it
+        */
+       if (!(op_state_waiting(op) ||
+             op_state_in_progress(op) ||
+             op_state_serviced(op) ||
+             op_state_purged(op))) {
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s: op %p not in a valid state (%0x), "
+                            "ignoring\n",
+                            __func__,
+                            op,
+                            op->op_state);
+               return;
+       }
+
+       spin_lock(&op->lock);
+
+       if (op_state_waiting(op)) {
+               /*
+                * upcall hasn't been read; remove op from upcall request
+                * list.
+                */
+               spin_unlock(&op->lock);
+               remove_op_from_request_list(op);
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "Interrupted: Removed op %p from request_list\n",
+                            op);
+       } else if (op_state_in_progress(op)) {
+               /* op must be removed from the in progress htable */
+               spin_unlock(&op->lock);
+               spin_lock(&htable_ops_in_progress_lock);
+               list_del(&op->list);
+               spin_unlock(&htable_ops_in_progress_lock);
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "Interrupted: Removed op %p"
+                            " from htable_ops_in_progress\n",
+                            op);
+       } else if (!op_state_serviced(op)) {
+               spin_unlock(&op->lock);
+               gossip_err("interrupted operation is in a weird state 0x%x\n",
+                          op->op_state);
+       } else {
+               /*
+                * already serviced: nothing to dequeue, but the op lock
+                * taken above must still be released, otherwise the next
+                * spin_lock(&op->lock) on this op would never succeed
+                */
+               spin_unlock(&op->lock);
+       }
+}
+
+/*
+ * sleeps on waitqueue waiting for matching downcall.
+ * if client-core finishes servicing, then we are good to go.
+ * else if client-core exits, we get woken up here, and retry with a timeout
+ *
+ * Postcondition stated by the original author: when this call returns to
+ * the caller, the specified op will no longer be on any list or htable.
+ * NOTE(review): only the timeout, purge and signal exits below call
+ * pvfs2_clean_up_interrupted_operation(); for the serviced exit the removal
+ * presumably happens elsewhere -- confirm.
+ *
+ * Returns 0 on success (op found in the serviced state) and -errno on
+ * failure.  Errors are:
+ * EAGAIN    the op was purged and op->attempts is still below
+ *           PVFS2_PURGE_RETRY_COUNT; the caller is expected to requeue
+ *           and try again.
+ * EIO       the op was purged and the retry budget is used up.
+ * ETIMEDOUT a timed wait of op_timeout_secs seconds expired.
+ * EINTR     a signal is pending for the current task.
+ */
+int wait_for_matching_downcall(struct pvfs2_kernel_op_s *op)
+{
+       int ret = -EINVAL;
+       DECLARE_WAITQUEUE(wait_entry, current);
+
+       spin_lock(&op->lock);
+       add_wait_queue(&op->waitq, &wait_entry);
+       spin_unlock(&op->lock);
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               spin_lock(&op->lock);
+               if (op_state_serviced(op)) {
+                       spin_unlock(&op->lock);
+                       ret = 0;
+                       break;
+               }
+               spin_unlock(&op->lock);
+
+               if (!signal_pending(current)) {
+                       /*
+                        * if this was our first attempt and client-core
+                        * has not purged our operation, we are happy to
+                        * simply wait (schedule() without any timeout)
+                        */
+                       spin_lock(&op->lock);
+                       if (op->attempts == 0 && !op_state_purged(op)) {
+                               spin_unlock(&op->lock);
+                               schedule();
+                       } else {
+                               spin_unlock(&op->lock);
+                               /*
+                                * subsequent attempts (or an already purged
+                                * op) wait with a timeout of op_timeout_secs
+                                * seconds.  A zero return from
+                                * schedule_timeout() is treated as "timed
+                                * out": the op is cleaned up and we give up
+                                * with -ETIMEDOUT.
+                                */
+                               if (!schedule_timeout(MSECS_TO_JIFFIES
+                                     (1000 * op_timeout_secs))) {
+                                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                                    "*** %s:"
+                                                    " operation timed out (tag"
+                                                    " %llu, %p, att %d)\n",
+                                                    __func__,
+                                                    llu(op->tag),
+                                                    op,
+                                                    op->attempts);
+                                       ret = -ETIMEDOUT;
+                                       pvfs2_clean_up_interrupted_operation
+                                           (op);
+                                       break;
+                               }
+                       }
+                       spin_lock(&op->lock);
+                       op->attempts++;
+                       /*
+                        * if the operation was purged in the meantime, it
+                        * is better to requeue it afresh but ensure that
+                        * we have not been purged repeatedly. This could
+                        * happen if client-core crashes when an op
+                        * is being serviced, so we requeue the op, client
+                        * core crashes again so we requeue the op, client
+                        * core starts, and so on...
+                        *
+                        * Hence -EAGAIN (requeue) while attempts is below
+                        * PVFS2_PURGE_RETRY_COUNT and -EIO once it is not.
+                        */
+                       if (op_state_purged(op)) {
+                               ret = (op->attempts < PVFS2_PURGE_RETRY_COUNT) ?
+                                        -EAGAIN :
+                                        -EIO;
+                               spin_unlock(&op->lock);
+                               gossip_debug(GOSSIP_WAIT_DEBUG,
+                                            "*** %s:"
+                                            " operation purged (tag "
+                                            "%llu, %p, att %d)\n",
+                                            __func__,
+                                            llu(op->tag),
+                                            op,
+                                            op->attempts);
+                               pvfs2_clean_up_interrupted_operation(op);
+                               break;
+                       }
+                       spin_unlock(&op->lock);
+                       continue;
+               }
+
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "*** %s:"
+                            " operation interrupted by a signal (tag "
+                            "%llu, op %p)\n",
+                            __func__,
+                            llu(op->tag),
+                            op);
+               pvfs2_clean_up_interrupted_operation(op);
+               ret = -EINTR;
+               break;
+       }
+
+       set_current_state(TASK_RUNNING);
+
+       spin_lock(&op->lock);
+       remove_wait_queue(&op->waitq, &wait_entry);
+       spin_unlock(&op->lock);
+
+       return ret;
+}
+
+/*
+ * similar to wait_for_matching_downcall(), but used in the special case
+ * of I/O cancellations.
+ *
+ * Note we need a special wait function because if this is called we already
+ *      know that a signal is pending in current and need to service the
+ *      cancellation upcall anyway.  the only way to exit this is to either
+ *      timeout or have the cancellation be serviced properly.
+ *
+ * What the code below actually does (the loop body runs at most once,
+ * every path through it ends in a break):
+ *   - returns 0 if the op is already in the serviced state when checked;
+ *   - returns -EINTR, after cleaning up the op, if signal_pending(current);
+ *   - otherwise sleeps once for up to op_timeout_secs seconds and returns
+ *     -ETIMEDOUT.  The op is cleaned up only when schedule_timeout()
+ *     returned 0; when the task was woken early -ETIMEDOUT is still
+ *     returned, without cleanup and without re-checking the op state.
+ *
+ * NOTE(review): the -EINTR exit looks at odds with the statement above that
+ * a signal is already known to be pending, and an early wake-up caused by
+ * the op being serviced is reported as -ETIMEDOUT -- confirm both are
+ * intended before relying on the return value.
+ */
+int wait_for_cancellation_downcall(struct pvfs2_kernel_op_s *op)
+{
+       int ret = -EINVAL;
+       DECLARE_WAITQUEUE(wait_entry, current);
+
+       spin_lock(&op->lock);
+       add_wait_queue(&op->waitq, &wait_entry);
+       spin_unlock(&op->lock);
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               spin_lock(&op->lock);
+               if (op_state_serviced(op)) {
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "%s:op-state is SERVICED.\n",
+                                    __func__);
+                       spin_unlock(&op->lock);
+                       ret = 0;
+                       break;
+               }
+               spin_unlock(&op->lock);
+
+               if (signal_pending(current)) {
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "%s:operation interrupted by a signal (tag"
+                                    " %llu, op %p)\n",
+                                    __func__,
+                                    llu(op->tag),
+                                    op);
+                       pvfs2_clean_up_interrupted_operation(op);
+                       ret = -EINTR;
+                       break;
+               }
+
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:About to call schedule_timeout.\n",
+                            __func__);
+               ret =
+                   schedule_timeout(MSECS_TO_JIFFIES(1000 * op_timeout_secs));
+
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:Value returned from schedule_timeout(%d).\n",
+                            __func__,
+                            ret);
+               if (!ret) {
+                       gossip_debug(GOSSIP_WAIT_DEBUG,
+                                    "%s:*** operation timed out: %p\n",
+                                    __func__,
+                                    op);
+                       pvfs2_clean_up_interrupted_operation(op);
+                       ret = -ETIMEDOUT;
+                       break;
+               }
+
+               gossip_debug(GOSSIP_WAIT_DEBUG,
+                            "%s:Breaking out of loop, regardless of value returned by schedule_timeout.\n",
+                            __func__);
+               ret = -ETIMEDOUT;
+               break;
+       }
+
+       set_current_state(TASK_RUNNING);
+
+       spin_lock(&op->lock);
+       remove_wait_queue(&op->waitq, &wait_entry);
+       spin_unlock(&op->lock);
+
+       gossip_debug(GOSSIP_WAIT_DEBUG,
+                    "%s:returning ret(%d)\n",
+                    __func__,
+                    ret);
+
+       return ret;
+}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
new file mode 100644 (file)
index 0000000..2766090
--- /dev/null
@@ -0,0 +1,532 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS extended attribute operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+#define SYSTEM_PVFS2_KEY "system.pvfs2."
+#define SYSTEM_PVFS2_KEY_LEN 13
+
+/*
+ * Decide whether a key may be shown by listxattr.
+ *
+ * Beware that the return value reads the opposite way from the name:
+ *   1 - the key is listable, i.e. it is shorter than SYSTEM_PVFS2_KEY_LEN
+ *       or does not begin with SYSTEM_PVFS2_KEY;
+ *   0 - the key begins with SYSTEM_PVFS2_KEY and must be left out of the
+ *       listing.
+ */
+static int is_reserved_key(const char *key, size_t size)
+{
+       int listable = 1;
+
+       if (size >= SYSTEM_PVFS2_KEY_LEN &&
+           strncmp(key, SYSTEM_PVFS2_KEY, SYSTEM_PVFS2_KEY_LEN) == 0)
+               listable = 0;
+
+       return listable;
+}
+
+/*
+ * Map the setxattr flags onto the PVFS_XATTR_* value sent in the upcall.
+ * XATTR_REPLACE is tested first, so it wins when both bits are set; with
+ * neither bit set the result is 0.
+ */
+static inline int convert_to_internal_xattr_flags(int setxattr_flags)
+{
+       /* Attribute must exist! */
+       if (setxattr_flags & XATTR_REPLACE)
+               return PVFS_XATTR_REPLACE;
+
+       /* Attribute must not exist */
+       if (setxattr_flags & XATTR_CREATE)
+               return PVFS_XATTR_CREATE;
+
+       return 0;
+}
+
+
+/*
+ * Tries to get a specified key's attributes of a given
+ * file into a user-specified buffer. Note that the getxattr
+ * interface allows for the users to probe the size of an
+ * extended attribute by passing in a value of 0 to size.
+ * Thus our return value is always the size of the attribute
+ * unless the key does not exist for the file and/or if
+ * there were errors in fetching the attribute value.
+ *
+ * @inode:  inode whose attribute is queried
+ * @prefix: string prepended to @name to form the key; NULL is treated as
+ *          the empty prefix, as pvfs2_inode_removexattr() does
+ * @name:   attribute name, must not be NULL
+ * @buffer: destination for the value; may be NULL only when @size is 0
+ * @size:   capacity of @buffer in bytes, or 0 to only probe the length
+ *
+ * Returns the value length reported by the downcall on success, -EINVAL
+ * for bad arguments or a key of PVFS_MAX_XATTR_NAMELEN or more characters,
+ * -ENOMEM if no op could be allocated, -ENODATA if the service reports
+ * -ENOENT, -ERANGE if the value does not fit in @buffer, or any other
+ * error returned by service_operation().
+ */
+ssize_t pvfs2_inode_getxattr(struct inode *inode, const char *prefix,
+               const char *name, void *buffer, size_t size)
+{
+       struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+       struct pvfs2_kernel_op_s *new_op = NULL;
+       ssize_t ret = -ENOMEM;
+       ssize_t length = 0;
+       size_t key_len;
+       int fsuid;
+       int fsgid;
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "%s: prefix %s name %s, buffer_size %zd\n",
+                    __func__, prefix, name, size);
+
+       if (name == NULL || (size > 0 && buffer == NULL)) {
+               gossip_err("pvfs2_inode_getxattr: bogus NULL pointers\n");
+               return -EINVAL;
+       }
+
+       /* strlen() and the key formatting below must never see NULL */
+       if (!prefix)
+               prefix = "";
+
+       /*
+        * size is a size_t and cannot be negative, so only the key length
+        * needs checking; it must leave room for the terminating NUL.
+        */
+       key_len = strlen(name) + strlen(prefix);
+       if (key_len >= PVFS_MAX_XATTR_NAMELEN) {
+               gossip_err("Invalid size (%d) or key length (%d)\n",
+                          (int)size,
+                          (int)key_len);
+               return -EINVAL;
+       }
+
+       fsuid = from_kuid(current_user_ns(), current_fsuid());
+       fsgid = from_kgid(current_user_ns(), current_fsgid());
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "getxattr on inode %pU, name %s "
+                    "(uid %o, gid %o)\n",
+                    get_khandle_from_ino(inode),
+                    name,
+                    fsuid,
+                    fsgid);
+
+       down_read(&pvfs2_inode->xattr_sem);
+
+       new_op = op_alloc(PVFS2_VFS_OP_GETXATTR);
+       if (!new_op)
+               goto out_unlock;
+
+       new_op->upcall.req.getxattr.refn = pvfs2_inode->refn;
+       ret = snprintf((char *)new_op->upcall.req.getxattr.key,
+                      PVFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name);
+
+       /*
+        * NOTE: Although keys are meant to be NULL terminated textual
+        * strings, I am going to explicitly pass the length just in case
+        * we change this later on...
+        */
+       new_op->upcall.req.getxattr.key_sz = ret + 1;
+
+       ret = service_operation(new_op, "pvfs2_inode_getxattr",
+                               get_interruptible_flag(inode));
+       if (ret != 0) {
+               if (ret == -ENOENT) {
+                       ret = -ENODATA;
+                       gossip_debug(GOSSIP_XATTR_DEBUG,
+                                    "pvfs2_inode_getxattr: inode %pU key %s"
+                                    " does not exist!\n",
+                                    get_khandle_from_ino(inode),
+                                    (char *)new_op->upcall.req.getxattr.key);
+               }
+               goto out_release_op;
+       }
+
+       /*
+        * Length returned includes null terminator.
+        */
+       length = new_op->downcall.resp.getxattr.val_sz;
+
+       /*
+        * Just return the length of the queried attribute.
+        */
+       if (size == 0) {
+               ret = length;
+               goto out_release_op;
+       }
+
+       /*
+        * Check to see if key length is > provided buffer size.
+        */
+       if (length > size) {
+               ret = -ERANGE;
+               goto out_release_op;
+       }
+
+       memset(buffer, 0, size);
+       memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
+
+       /* report the value length; ret is still 0 at this point */
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "pvfs2_inode_getxattr: inode %pU "
+                    "key %s key_sz %d, val_len %d\n",
+                    get_khandle_from_ino(inode),
+                    (char *)new_op->upcall.req.getxattr.key,
+                    (int)new_op->upcall.req.getxattr.key_sz,
+                    (int)length);
+
+       ret = length;
+
+out_release_op:
+       op_release(new_op);
+out_unlock:
+       up_read(&pvfs2_inode->xattr_sem);
+       return ret;
+}
+
+/*
+ * Ask the service to remove the attribute "<prefix><name>" from an inode.
+ * A NULL prefix counts as the empty prefix.
+ *
+ * -ENOENT from the service is remapped: with XATTR_REPLACE set in flags it
+ * becomes -ENODATA, otherwise the removal is reported as success (0).
+ * Returns -ENOMEM when no op can be allocated; any other
+ * service_operation() result is passed through unchanged.
+ */
+static int pvfs2_inode_removexattr(struct inode *inode,
+                           const char *prefix,
+                           const char *name,
+                           int flags)
+{
+       struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+       struct pvfs2_kernel_op_s *op;
+       int key_len;
+       int rc;
+
+       down_write(&pvfs2_inode->xattr_sem);
+
+       op = op_alloc(PVFS2_VFS_OP_REMOVEXATTR);
+       if (!op) {
+               rc = -ENOMEM;
+               goto out_unlock;
+       }
+
+       op->upcall.req.removexattr.refn = pvfs2_inode->refn;
+
+       /*
+        * The key is a NUL terminated string, but its length (terminator
+        * included) is passed along explicitly as well.
+        */
+       key_len = snprintf((char *)op->upcall.req.removexattr.key,
+                          PVFS_MAX_XATTR_NAMELEN,
+                          "%s%s",
+                          prefix ? prefix : "",
+                          name);
+       op->upcall.req.removexattr.key_sz = key_len + 1;
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "pvfs2_inode_removexattr: key %s, key_sz %d\n",
+                    (char *)op->upcall.req.removexattr.key,
+                    (int)op->upcall.req.removexattr.key_sz);
+
+       rc = service_operation(op,
+                              "pvfs2_inode_removexattr",
+                              get_interruptible_flag(inode));
+
+       /* replacing a non-existent attribute is an error, removing one is not */
+       if (rc == -ENOENT)
+               rc = (flags & XATTR_REPLACE) ? -ENODATA : 0;
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "pvfs2_inode_removexattr: returning %d\n", rc);
+
+       op_release(op);
+out_unlock:
+       up_write(&pvfs2_inode->xattr_sem);
+       return rc;
+}
+
+/*
+ * Tries to set an attribute for a given key on a file.
+ *
+ * @inode:  inode the attribute is set on
+ * @prefix: string prepended to @name to form the key; NULL is treated as
+ *          the empty prefix
+ * @name:   attribute name, must not be NULL
+ * @value:  attribute value; may be NULL only when @size is 0, in which
+ *          case the call is handled as a removal of the attribute
+ * @size:   number of bytes in @value, must be below PVFS_MAX_XATTR_VALUELEN
+ * @flags:  setxattr flags (XATTR_CREATE / XATTR_REPLACE), must not be
+ *          negative
+ *
+ * Returns a -ve number on error and 0 on success.  Key is text, but value
+ * can be binary!
+ */
+int pvfs2_inode_setxattr(struct inode *inode, const char *prefix,
+               const char *name, const void *value, size_t size, int flags)
+{
+       struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+       struct pvfs2_kernel_op_s *new_op;
+       int internal_flag = 0;
+       size_t key_len;
+       int ret = -ENOMEM;
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "%s: prefix %s, name %s, buffer_size %zd\n",
+                    __func__, prefix, name, size);
+
+       /* size is a size_t, so only the upper bound can be violated */
+       if (size >= PVFS_MAX_XATTR_VALUELEN ||
+           flags < 0) {
+               gossip_err("pvfs2_inode_setxattr: bogus values of size(%d), flags(%d)\n",
+                          (int)size,
+                          flags);
+               return -EINVAL;
+       }
+
+       if (name == NULL ||
+           (size > 0 && value == NULL)) {
+               gossip_err("pvfs2_inode_setxattr: bogus NULL pointers!\n");
+               return -EINVAL;
+       }
+
+       internal_flag = convert_to_internal_xattr_flags(flags);
+
+       /*
+        * Normalize a NULL prefix once, so that neither the length check
+        * nor the key formatting below is handed a NULL string.
+        */
+       if (!prefix)
+               prefix = "";
+
+       /* the key must fit its buffer including the terminating NUL */
+       key_len = strlen(name) + strlen(prefix);
+       if (key_len >= PVFS_MAX_XATTR_NAMELEN) {
+               gossip_err
+                   ("pvfs2_inode_setxattr: bogus key size (%d)\n",
+                    (int)key_len);
+               return -EINVAL;
+       }
+
+       /* This is equivalent to a removexattr */
+       if (size == 0 && value == NULL) {
+               gossip_debug(GOSSIP_XATTR_DEBUG,
+                            "removing xattr (%s%s)\n",
+                            prefix,
+                            name);
+               return pvfs2_inode_removexattr(inode, prefix, name, flags);
+       }
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "setxattr on inode %pU, name %s\n",
+                    get_khandle_from_ino(inode),
+                    name);
+
+       down_write(&pvfs2_inode->xattr_sem);
+       new_op = op_alloc(PVFS2_VFS_OP_SETXATTR);
+       if (!new_op)
+               goto out_unlock;
+
+
+       new_op->upcall.req.setxattr.refn = pvfs2_inode->refn;
+       new_op->upcall.req.setxattr.flags = internal_flag;
+       /*
+        * NOTE: Although keys are meant to be NULL terminated textual
+        * strings, I am going to explicitly pass the length just in
+        * case we change this later on...
+        */
+       ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
+                      PVFS_MAX_XATTR_NAMELEN,
+                      "%s%s",
+                      prefix, name);
+       new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
+       memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
+       new_op->upcall.req.setxattr.keyval.val_sz = size;
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "pvfs2_inode_setxattr: key %s, key_sz %d "
+                    " value size %zd\n",
+                    (char *)new_op->upcall.req.setxattr.keyval.key,
+                    (int)new_op->upcall.req.setxattr.keyval.key_sz,
+                    size);
+
+       ret = service_operation(new_op,
+                               "pvfs2_inode_setxattr",
+                               get_interruptible_flag(inode));
+
+       gossip_debug(GOSSIP_XATTR_DEBUG,
+                    "pvfs2_inode_setxattr: returning %d\n",
+                    ret);
+
+       /* when request is serviced properly, free req op struct */
+       op_release(new_op);
+out_unlock:
+       up_write(&pvfs2_inode->xattr_sem);
+       return ret;
+}
+
+/*
+ * Tries to get a specified object's keys into a user-specified buffer of a
+ * given size.  Note that like the previous instances of xattr routines, this
+ * also allows you to pass in a NULL pointer and 0 size to probe the size for
+ * subsequent memory allocations. Thus our return value is always the size of
+ * all the keys unless there were errors in fetching the keys!
+ *
+ * @dentry: dentry whose inode is queried
+ * @buffer: destination for the key names; may be NULL only when @size is 0
+ * @size:   capacity of @buffer in bytes, or 0 to only probe
+ *
+ * With @size == 0 the result is an upper bound (returned_count *
+ * PVFS_MAX_XATTR_NAMELEN), not an exact size.  Otherwise keys are copied
+ * whole, batch by batch, until the service signals PVFS_ITERATE_END or the
+ * next key would not fit; keys rejected by is_reserved_key() are skipped.
+ *
+ * Returns the number of bytes placed in @buffer (or the probe bound),
+ * -EINVAL for a NULL buffer with a non-zero size, -ENOMEM if no op could
+ * be allocated, or the error returned by service_operation().
+ */
+ssize_t pvfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+       struct inode *inode = dentry->d_inode;
+       struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+       struct pvfs2_kernel_op_s *new_op;
+       __u64 token = PVFS_ITERATE_START;
+       ssize_t ret = -ENOMEM;
+       ssize_t total = 0;
+       ssize_t length = 0;
+       int count_keys = 0;
+       int key_size;
+       int i = 0;
+
+       /* size is a size_t: it cannot be negative, only the pointer can be bad */
+       if (size > 0 && buffer == NULL) {
+               gossip_err("%s: bogus NULL pointers\n", __func__);
+               return -EINVAL;
+       }
+
+       down_read(&pvfs2_inode->xattr_sem);
+       new_op = op_alloc(PVFS2_VFS_OP_LISTXATTR);
+       if (!new_op)
+               goto out_unlock;
+
+       if (buffer && size > 0)
+               memset(buffer, 0, size);
+
+try_again:
+       /* key_size is the read offset into the current batch of keys */
+       key_size = 0;
+       new_op->upcall.req.listxattr.refn = pvfs2_inode->refn;
+       new_op->upcall.req.listxattr.token = token;
+       new_op->upcall.req.listxattr.requested_count =
+           (size == 0) ? 0 : PVFS_MAX_XATTR_LISTLEN;
+       ret = service_operation(new_op, __func__,
+                               get_interruptible_flag(inode));
+       if (ret != 0)
+               goto done;
+
+       if (size == 0) {
+               /*
+                * This is a bit of a big upper limit, but I did not want to
+                * spend too much time getting this correct, since users end
+                * up allocating memory rather than us...
+                */
+               total = new_op->downcall.resp.listxattr.returned_count *
+                       PVFS_MAX_XATTR_NAMELEN;
+               goto done;
+       }
+
+       length = new_op->downcall.resp.listxattr.keylen;
+       if (length == 0)
+               goto done;
+
+       /*
+        * Check to see how much can be fit in the buffer. Fit only whole keys.
+        *
+        * NOTE(review): returned_count and lengths[] come straight from the
+        * downcall and are not range checked here -- confirm the sender is
+        * trusted to keep them within the response buffers.
+        */
+       for (i = 0; i < new_op->downcall.resp.listxattr.returned_count; i++) {
+               if (total + new_op->downcall.resp.listxattr.lengths[i] > size)
+                       goto done;
+
+               /*
+                * Since many dumb programs try to setxattr() on our reserved
+                * xattrs this is a feeble attempt at defeating those by not
+                * listing them in the output of listxattr.. sigh
+                *
+                * is_reserved_key() returns non-zero for keys that are NOT
+                * reserved, so the true branch is the one that copies.
+                */
+               if (is_reserved_key(new_op->downcall.resp.listxattr.key +
+                                   key_size,
+                                   new_op->downcall.resp.
+                                       listxattr.lengths[i])) {
+                       gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n",
+                                       i, new_op->downcall.resp.listxattr.key +
+                                               key_size);
+                       memcpy(buffer + total,
+                               new_op->downcall.resp.listxattr.key + key_size,
+                               new_op->downcall.resp.listxattr.lengths[i]);
+                       total += new_op->downcall.resp.listxattr.lengths[i];
+                       count_keys++;
+               } else {
+                       gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n",
+                                       i, new_op->downcall.resp.listxattr.key +
+                                               key_size);
+               }
+               key_size += new_op->downcall.resp.listxattr.lengths[i];
+       }
+
+       /*
+        * Since the buffer was large enough, we might have to continue
+        * fetching more keys!
+        */
+       token = new_op->downcall.resp.listxattr.token;
+       if (token != PVFS_ITERATE_END)
+               goto try_again;
+
+done:
+       gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d"
+                    " [size of buffer %ld] (filled in %d keys)\n",
+                    __func__,
+                    ret ? (int)ret : (int)total,
+                    (long)size,
+                    count_keys);
+       op_release(new_op);
+       /* on success report the byte count instead of the 0 status */
+       if (ret == 0)
+               ret = total;
+out_unlock:
+       up_read(&pvfs2_inode->xattr_sem);
+       return ret;
+}
+
+/*
+ * .set callback of the default xattr handler: stores the attribute under
+ * PVFS2_XATTR_NAME_DEFAULT_PREFIX followed by name.  handler_flags is not
+ * used.
+ */
+int pvfs2_xattr_set_default(struct dentry *dentry,
+                           const char *name,
+                           const void *buffer,
+                           size_t size,
+                           int flags,
+                           int handler_flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       return pvfs2_inode_setxattr(inode, PVFS2_XATTR_NAME_DEFAULT_PREFIX,
+                                   name, buffer, size, flags);
+}
+
+/*
+ * .get callback of the default xattr handler: fetches the attribute stored
+ * under PVFS2_XATTR_NAME_DEFAULT_PREFIX followed by name.  handler_flags is
+ * not used.
+ */
+int pvfs2_xattr_get_default(struct dentry *dentry,
+                           const char *name,
+                           void *buffer,
+                           size_t size,
+                           int handler_flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       return pvfs2_inode_getxattr(inode, PVFS2_XATTR_NAME_DEFAULT_PREFIX,
+                                   name, buffer, size);
+}
+
+/*
+ * .set callback of the trusted xattr handler: stores the attribute under
+ * PVFS2_XATTR_NAME_TRUSTED_PREFIX followed by name.  handler_flags is not
+ * used.
+ */
+static int pvfs2_xattr_set_trusted(struct dentry *dentry,
+                           const char *name,
+                           const void *buffer,
+                           size_t size,
+                           int flags,
+                           int handler_flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       return pvfs2_inode_setxattr(inode, PVFS2_XATTR_NAME_TRUSTED_PREFIX,
+                                   name, buffer, size, flags);
+}
+
+/*
+ * .get callback of the trusted xattr handler: fetches the attribute stored
+ * under PVFS2_XATTR_NAME_TRUSTED_PREFIX followed by name.  handler_flags is
+ * not used.
+ */
+static int pvfs2_xattr_get_trusted(struct dentry *dentry,
+                           const char *name,
+                           void *buffer,
+                           size_t size,
+                           int handler_flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       return pvfs2_inode_getxattr(inode, PVFS2_XATTR_NAME_TRUSTED_PREFIX,
+                                   name, buffer, size);
+}
+
+/*
+ * Handler for keys starting with PVFS2_XATTR_NAME_TRUSTED_PREFIX.  It is
+ * never modified and is only referenced through the pointer-to-const table
+ * pvfs2_xattr_handlers[], so it is declared const.
+ */
+static const struct xattr_handler pvfs2_xattr_trusted_handler = {
+       .prefix = PVFS2_XATTR_NAME_TRUSTED_PREFIX,
+       .get = pvfs2_xattr_get_trusted,
+       .set = pvfs2_xattr_set_trusted,
+};
+
+/*
+ * Catch-all handler.  It is never modified and is only referenced through
+ * the pointer-to-const table pvfs2_xattr_handlers[], so it is declared
+ * const.
+ */
+static const struct xattr_handler pvfs2_xattr_default_handler = {
+       /*
+        * NOTE: this is set to be the empty string.
+        * so that all un-prefixed xattrs keys get caught
+        * here!
+        */
+       .prefix = PVFS2_XATTR_NAME_DEFAULT_PREFIX,
+       .get = pvfs2_xattr_get_default,
+       .set = pvfs2_xattr_set_default,
+};
+
+const struct xattr_handler *pvfs2_xattr_handlers[] = {
+       &posix_acl_access_xattr_handler,
+       &posix_acl_default_xattr_handler,
+       &pvfs2_xattr_trusted_handler,
+       &pvfs2_xattr_default_handler, /* empty-prefix catch-all; NOTE(review): presumably must stay last -- confirm */
+       NULL /* end of table */
+};