dm thin metadata: try to avoid ever aborting transactions
authorJoe Thornber <ejt@redhat.com>
Mon, 10 Sep 2018 15:50:09 +0000 (16:50 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 10 Oct 2018 06:53:21 +0000 (08:53 +0200)
[ Upstream commit 3ab91828166895600efd9cdc3a0eb32001f7204a ]

Committing a transaction can consume some metadata of it's own, we now
reserve a small amount of metadata to cover this.  Free metadata
reported by the kernel will not include this reserve.

If any of the reserve has been used after a commit we enter a new
internal state PM_OUT_OF_METADATA_SPACE.  This is reported as
PM_READ_ONLY, so no userland changes are needed.  If the metadata
device is resized the pool will move back to PM_WRITE.

These changes mean we never need to abort and rollback a transaction due
to running out of metadata space.  This is particularly important
because there have been a handful of reports of data corruption against
DM thin-provisioning that can all be attributed to the thin-pool having
ran out of metadata space.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin.c

index e976f4f393341e5d44b3d8982a15684fe20b208c..7e0ccb6fee9dee11344763791a2ba36673dd6fc3 100644 (file)
@@ -189,6 +189,12 @@ struct dm_pool_metadata {
        unsigned long flags;
        sector_t data_block_size;
 
+       /*
+        * We reserve a section of the metadata for commit overhead.
+        * All reported space does *not* include this.
+        */
+       dm_block_t metadata_reserve;
+
        /*
         * Set if a transaction has to be aborted but the attempt to roll back
         * to the previous (good) transaction failed.  The only pool metadata
@@ -827,6 +833,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
        return dm_tm_commit(pmd->tm, sblock);
 }
 
+static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
+{
+       int r;
+       dm_block_t total;
+       dm_block_t max_blocks = 4096; /* 16M */
+
+       r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
+       if (r) {
+               DMERR("could not get size of metadata device");
+               pmd->metadata_reserve = max_blocks;
+       } else {
+               sector_div(total, 10);
+               pmd->metadata_reserve = min(max_blocks, total);
+       }
+}
+
 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
                                               sector_t data_block_size,
                                               bool format_device)
@@ -860,6 +882,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
                return ERR_PTR(r);
        }
 
+       __set_metadata_reserve(pmd);
+
        return pmd;
 }
 
@@ -1831,6 +1855,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
        down_read(&pmd->root_lock);
        if (!pmd->fail_io)
                r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+
+       if (!r) {
+               if (*result < pmd->metadata_reserve)
+                       *result = 0;
+               else
+                       *result -= pmd->metadata_reserve;
+       }
        up_read(&pmd->root_lock);
 
        return r;
@@ -1943,8 +1974,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
        int r = -EINVAL;
 
        down_write(&pmd->root_lock);
-       if (!pmd->fail_io)
+       if (!pmd->fail_io) {
                r = __resize_space_map(pmd->metadata_sm, new_count);
+               if (!r)
+                       __set_metadata_reserve(pmd);
+       }
        up_write(&pmd->root_lock);
 
        return r;
index a952ad890f32b2e18da95548019266a67124edd0..81309d7836c5d31dfce67079e81725ac08519c89 100644 (file)
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
 enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
+
+       /*
+        * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
+        */
+       PM_OUT_OF_METADATA_SPACE,
        PM_READ_ONLY,           /* metadata may not be changed */
+
        PM_FAIL,                /* all I/O fails */
 };
 
@@ -1386,7 +1392,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 
 static void requeue_bios(struct pool *pool);
 
-static void check_for_space(struct pool *pool)
+static bool is_read_only_pool_mode(enum pool_mode mode)
+{
+       return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
+}
+
+static bool is_read_only(struct pool *pool)
+{
+       return is_read_only_pool_mode(get_pool_mode(pool));
+}
+
+static void check_for_metadata_space(struct pool *pool)
+{
+       int r;
+       const char *ooms_reason = NULL;
+       dm_block_t nr_free;
+
+       r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
+       if (r)
+               ooms_reason = "Could not get free metadata blocks";
+       else if (!nr_free)
+               ooms_reason = "No free metadata blocks";
+
+       if (ooms_reason && !is_read_only(pool)) {
+               DMERR("%s", ooms_reason);
+               set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
+       }
+}
+
+static void check_for_data_space(struct pool *pool)
 {
        int r;
        dm_block_t nr_free;
@@ -1412,14 +1446,16 @@ static int commit(struct pool *pool)
 {
        int r;
 
-       if (get_pool_mode(pool) >= PM_READ_ONLY)
+       if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
                return -EINVAL;
 
        r = dm_pool_commit_metadata(pool->pmd);
        if (r)
                metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
-       else
-               check_for_space(pool);
+       else {
+               check_for_metadata_space(pool);
+               check_for_data_space(pool);
+       }
 
        return r;
 }
@@ -1485,6 +1521,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                return r;
        }
 
+       r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
+       if (r) {
+               metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
+               return r;
+       }
+
+       if (!free_blocks) {
+               /* Let's commit before we use up the metadata reserve. */
+               r = commit(pool);
+               if (r)
+                       return r;
+       }
+
        return 0;
 }
 
@@ -1516,6 +1565,7 @@ static int should_error_unserviceable_bio(struct pool *pool)
        case PM_OUT_OF_DATA_SPACE:
                return pool->pf.error_if_no_space ? -ENOSPC : 0;
 
+       case PM_OUT_OF_METADATA_SPACE:
        case PM_READ_ONLY:
        case PM_FAIL:
                return -EIO;
@@ -2479,8 +2529,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                error_retry_list(pool);
                break;
 
+       case PM_OUT_OF_METADATA_SPACE:
        case PM_READ_ONLY:
-               if (old_mode != new_mode)
+               if (!is_read_only_pool_mode(old_mode))
                        notify_of_pool_mode_change(pool, "read-only");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_read_only;
@@ -3418,6 +3469,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
                DMINFO("%s: growing the metadata device from %llu to %llu blocks",
                       dm_device_name(pool->pool_md),
                       sb_metadata_dev_size, metadata_dev_size);
+
+               if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
+                       set_pool_mode(pool, PM_WRITE);
+
                r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
                if (r) {
                        metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3721,7 +3776,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
 
-       if (get_pool_mode(pool) >= PM_READ_ONLY) {
+       if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
                      dm_device_name(pool->pool_md));
                return -EOPNOTSUPP;
@@ -3795,6 +3850,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
        dm_block_t nr_blocks_data;
        dm_block_t nr_blocks_metadata;
        dm_block_t held_root;
+       enum pool_mode mode;
        char buf[BDEVNAME_SIZE];
        char buf2[BDEVNAME_SIZE];
        struct pool_c *pt = ti->private;
@@ -3865,9 +3921,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("- ");
 
-               if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+               mode = get_pool_mode(pool);
+               if (mode == PM_OUT_OF_DATA_SPACE)
                        DMEMIT("out_of_data_space ");
-               else if (pool->pf.mode == PM_READ_ONLY)
+               else if (is_read_only_pool_mode(mode))
                        DMEMIT("ro ");
                else
                        DMEMIT("rw ");