[PATCH] Add tmpfs options for memory placement policies
author Robin Holt <holt@sgi.com>
Sat, 14 Jan 2006 21:20:48 +0000 (13:20 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Sun, 15 Jan 2006 02:27:07 +0000 (18:27 -0800)
Anything that writes into a tmpfs filesystem is liable to disproportionately
decrease the available memory on a particular node.  Since there's no telling
what sort of application (e.g.  dd/cp/cat) might be dropping large files
there, this lets the admin choose the appropriate default behavior for their
site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and
a second option to specify the nodelist for that policy.  With the default
policy, tmpfs will behave as it does today.  This patch adds support for
preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node
which is doing the writing.  Some jobs expect a single process to create
and manage the tmpfs files.  This results in a node which has a
significantly reduced number of free pages.

With this patch, the administrator can specify the policy and nodes for
that policy where they would prefer allocations.

This patch was originally written by Brent Casavant and Hugh Dickins.  I
added support for the bind and preferred policies and the mpol_nodelist
mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Documentation/filesystems/tmpfs.txt
fs/hugetlbfs/inode.c
include/linux/mempolicy.h
include/linux/shmem_fs.h
mm/mempolicy.c
mm/shmem.c

index 0d783c504eade32e666e7a19b6fa89f85bfb9416..dbe4d87d26154dfe422bd20b51551509c0a99ef5 100644 (file)
@@ -78,6 +78,18 @@ use up all the memory on the machine; but enhances the scalability of
 that instance in a system with many cpus making intensive use of it.
 
 
+tmpfs has a mount option to set the NUMA memory allocation policy for
+all files in that instance:
+mpol=interleave                prefers to allocate memory from each node in turn
+mpol=default           prefers to allocate memory from the local node
+mpol=bind              prefers to allocate from mpol_nodelist
+mpol=preferred         prefers to allocate from first node in mpol_nodelist
+
+The following mount option is used in conjunction with mpol=interleave,
+mpol=bind or mpol=preferred:
+mpol_nodelist: nodelist suitable for parsing with nodelist_parse.
+
+
 To specify the initial root directory you can use the following mount
 options:
 
index ab4c3a9d51b88948650d345378934fe912c8c440..f568102da1e8df3be5126e91a50914d55dc5c0bb 100644 (file)
@@ -402,7 +402,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                info = HUGETLBFS_I(inode);
-               mpol_shared_policy_init(&info->policy);
+               mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
                switch (mode & S_IFMT) {
                default:
                        init_special_inode(inode, mode, dev);
index c7ac77e873b3fa800e4c4d3a3db0bd43dcaf7192..d6a53ed6ab6c530c490ee43d5fd6a67faa70f48a 100644 (file)
@@ -132,12 +132,8 @@ struct shared_policy {
        spinlock_t lock;
 };
 
-static inline void mpol_shared_policy_init(struct shared_policy *info)
-{
-       info->root = RB_ROOT;
-       spin_lock_init(&info->lock);
-}
-
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+                               nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
                                struct vm_area_struct *vma,
                                struct mempolicy *new);
@@ -211,7 +207,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
        return -EINVAL;
 }
 
-static inline void mpol_shared_policy_init(struct shared_policy *info)
+static inline void mpol_shared_policy_init(struct shared_policy *info,
+                                       int policy, nodemask_t *nodes)
 {
 }
 
index c3e598276e78ea5d30990c0f4122e553352c4db2..c057f0b32318bdc40ea879086805e174db29c035 100644 (file)
@@ -26,6 +26,8 @@ struct shmem_sb_info {
        unsigned long free_blocks;  /* How many are left for allocation */
        unsigned long max_inodes;   /* How many inodes are allowed */
        unsigned long free_inodes;  /* How many are left for allocation */
+       int policy;                 /* Default NUMA memory alloc policy */
+       nodemask_t policy_nodes;    /* nodemask for preferred and bind */
        spinlock_t    stat_lock;
 };
 
index b62cab575a84bb241dad5c1c91717d97a97d2c4d..3171f884d2459a30ad113d9008b82b04d833671c 100644 (file)
@@ -1359,6 +1359,30 @@ restart:
        return 0;
 }
 
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+                               nodemask_t *policy_nodes)
+{
+       info->root = RB_ROOT;
+       spin_lock_init(&info->lock);
+
+       if (policy != MPOL_DEFAULT) {
+               struct mempolicy *newpol;
+
+               /* Falls back to MPOL_DEFAULT on any error */
+               newpol = mpol_new(policy, policy_nodes);
+               if (!IS_ERR(newpol)) {
+                       /* Create pseudo-vma that contains just the policy */
+                       struct vm_area_struct pvma;
+
+                       memset(&pvma, 0, sizeof(struct vm_area_struct));
+                       /* Policy covers entire file */
+                       pvma.vm_end = TASK_SIZE;
+                       mpol_set_shared_policy(info, &pvma, newpol);
+                       mpol_free(newpol);
+               }
+       }
+}
+
 int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
 {
index 343b3c0937e56712e6cd94bf6e764e23acb49143..ce501bce1c2e2369959666de3923e0085112bb73 100644 (file)
@@ -1316,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
                case S_IFREG:
                        inode->i_op = &shmem_inode_operations;
                        inode->i_fop = &shmem_file_operations;
-                       mpol_shared_policy_init(&info->policy);
+                       mpol_shared_policy_init(&info->policy, sbinfo->policy,
+                                                       &sbinfo->policy_nodes);
                        break;
                case S_IFDIR:
                        inode->i_nlink++;
@@ -1330,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
                         * Must not load anything in the rbtree,
                         * mpol_free_shared_policy will not be called.
                         */
-                       mpol_shared_policy_init(&info->policy);
+                       mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+                                               NULL);
                        break;
                }
        } else if (sbinfo->max_inodes) {
@@ -1843,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
        .put_link       = shmem_put_link,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+       gid_t *gid, unsigned long *blocks, unsigned long *inodes,
+       int *policy, nodemask_t *policy_nodes)
 {
        char *this_char, *value, *rest;
 
@@ -1897,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
                        *gid = simple_strtoul(value,&rest,0);
                        if (*rest)
                                goto bad_val;
+               } else if (!strcmp(this_char,"mpol")) {
+                       if (!strcmp(value,"default"))
+                               *policy = MPOL_DEFAULT;
+                       else if (!strcmp(value,"preferred"))
+                               *policy = MPOL_PREFERRED;
+                       else if (!strcmp(value,"bind"))
+                               *policy = MPOL_BIND;
+                       else if (!strcmp(value,"interleave"))
+                               *policy = MPOL_INTERLEAVE;
+                       else
+                               goto bad_val;
+               } else if (!strcmp(this_char,"mpol_nodelist")) {
+                       nodelist_parse(value, *policy_nodes);
                } else {
                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
                               this_char);
@@ -1917,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        unsigned long max_blocks = sbinfo->max_blocks;
        unsigned long max_inodes = sbinfo->max_inodes;
+       int policy = sbinfo->policy;
+       nodemask_t policy_nodes = sbinfo->policy_nodes;
        unsigned long blocks;
        unsigned long inodes;
        int error = -EINVAL;
 
-       if (shmem_parse_options(data, NULL, NULL, NULL,
-                               &max_blocks, &max_inodes))
+       if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+                               &max_inodes, &policy, &policy_nodes))
                return error;
 
        spin_lock(&sbinfo->stat_lock);
@@ -1948,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        sbinfo->free_blocks = max_blocks - blocks;
        sbinfo->max_inodes  = max_inodes;
        sbinfo->free_inodes = max_inodes - inodes;
+       sbinfo->policy = policy;
+       sbinfo->policy_nodes = policy_nodes;
 out:
        spin_unlock(&sbinfo->stat_lock);
        return error;
@@ -1972,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
        struct shmem_sb_info *sbinfo;
        unsigned long blocks = 0;
        unsigned long inodes = 0;
+       int policy = MPOL_DEFAULT;
+       nodemask_t policy_nodes = node_online_map;
 
 #ifdef CONFIG_TMPFS
        /*
@@ -1984,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
                inodes = totalram_pages - totalhigh_pages;
                if (inodes > blocks)
                        inodes = blocks;
-               if (shmem_parse_options(data, &mode, &uid, &gid,
-                                       &blocks, &inodes))
+               if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+                                       &inodes, &policy, &policy_nodes))
                        return -EINVAL;
        }
 #else
@@ -2003,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
        sbinfo->free_blocks = blocks;
        sbinfo->max_inodes = inodes;
        sbinfo->free_inodes = inodes;
+       sbinfo->policy = policy;
+       sbinfo->policy_nodes = policy_nodes;
 
        sb->s_fs_info = sbinfo;
        sb->s_maxbytes = SHMEM_MAX_BYTES;