drbd: on attach, enforce clean meta data
authorLars Ellenberg <lars.ellenberg@linbit.com>
Tue, 5 Jul 2011 18:59:26 +0000 (20:59 +0200)
committerPhilipp Reisner <philipp.reisner@linbit.com>
Thu, 8 Nov 2012 15:57:51 +0000 (16:57 +0100)
Detection of unclean shutdown has moved into user space.

The kernel code will, whenever it updates the meta data, mark it as
"unclean", and will refuse to attach to such unclean meta data.

"drbdadm up" now schedules "drbdmeta apply-al", which will apply
the activity log to the bitmap, and/or reinitialize it, if necessary,
as well as set a "clean" indicator flag.

This moves a bit code out of kernel space.
As a side effect, it also prevents some 8.3 module from accidentally
ignoring the 8.4 style activity log, if someone should downgrade,
whether on purpose, or accidentally because he changed kernel versions
without providing an 8.4 for the new kernel, and the new kernel comes
with in-tree 8.3.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_state.c
include/linux/drbd.h

index 58b5b61628fca454d507ebfda778156df68b1918..da8ffd54fc18cb594da3ca877b7122947521be5d 100644 (file)
@@ -462,265 +462,6 @@ w_al_write_transaction(struct drbd_work *w, int unused)
        return 0;
 }
 
-/* FIXME
- * reading of the activity log,
- * and potentially dirtying of the affected bitmap regions,
- * should be done from userland only.
- * DRBD would simply always attach with an empty activity log,
- * and refuse to attach to something that looks like a crashed primary.
- */
-
-/**
- * drbd_al_read_tr() - Read a single transaction from the on disk activity log
- * @mdev:      DRBD device.
- * @bdev:      Block device to read form.
- * @b:         pointer to an al_transaction.
- * @index:     On disk slot of the transaction to read.
- *
- * Returns -1 on IO error, 0 on checksum error and 1 upon success.
- */
-static int drbd_al_read_tr(struct drbd_conf *mdev,
-                          struct drbd_backing_dev *bdev,
-                          int index)
-{
-       struct al_transaction_on_disk *b = page_address(mdev->md_io_page);
-       sector_t sector;
-       u32 crc;
-
-       sector =  bdev->md.md_offset
-               + bdev->md.al_offset
-               + index * (MD_BLOCK_SIZE>>9);
-
-       /* Dont process error normally,
-        * as this is done before disk is attached! */
-       if (drbd_md_sync_page_io(mdev, bdev, sector, READ))
-               return -1;
-
-       if (!expect(b->magic == cpu_to_be32(DRBD_AL_MAGIC)))
-               return 0;
-
-       if (!expect(be16_to_cpu(b->n_updates) <= AL_UPDATES_PER_TRANSACTION))
-               return 0;
-
-       if (!expect(be16_to_cpu(b->context_size) <= DRBD_AL_EXTENTS_MAX))
-               return 0;
-
-       if (!expect(be16_to_cpu(b->context_start_slot_nr) < DRBD_AL_EXTENTS_MAX))
-               return 0;
-
-       crc = be32_to_cpu(b->crc32c);
-       b->crc32c = 0;
-       if (!expect(crc == crc32c(0, b, 4096)))
-               return 0;
-
-       return 1;
-}
-
-/**
- * drbd_al_read_log() - Restores the activity log from its on disk representation.
- * @mdev:      DRBD device.
- * @bdev:      Block device to read form.
- *
- * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
- */
-int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
-{
-       struct al_transaction_on_disk *b;
-       int i;
-       int rv;
-       int mx;
-       int active_extents = 0;
-       int transactions = 0;
-       int found_valid = 0;
-       int found_initialized = 0;
-       int from = 0;
-       int to = 0;
-       u32 from_tnr = 0;
-       u32 to_tnr = 0;
-       u32 cnr;
-
-       /* Note that this is expected to be called with a newly created,
-        * clean and all unused activity log of the "expected size".
-        */
-
-       /* lock out all other meta data io for now,
-        * and make sure the page is mapped.
-        */
-       b = drbd_md_get_buffer(mdev);
-       if (!b)
-               return 0;
-
-       /* Always use the full ringbuffer space for now.
-        * possible optimization: read in all of it,
-        * then scan the in-memory pages. */
-
-       mx = (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-
-       /* Find the valid transaction in the log */
-       for (i = 0; i < mx; i++) {
-               rv = drbd_al_read_tr(mdev, bdev, i);
-               /* invalid data in that block */
-               if (rv == 0)
-                       continue;
-               if (be16_to_cpu(b->transaction_type) == AL_TR_INITIALIZED) {
-                       ++found_initialized;
-                       continue;
-               }
-
-               /* IO error */
-               if (rv == -1) {
-                       drbd_md_put_buffer(mdev);
-                       return 0;
-               }
-
-               cnr = be32_to_cpu(b->tr_number);
-               if (++found_valid == 1) {
-                       from = i;
-                       to = i;
-                       from_tnr = cnr;
-                       to_tnr = cnr;
-                       continue;
-               }
-
-               D_ASSERT(cnr != to_tnr);
-               D_ASSERT(cnr != from_tnr);
-               if ((int)cnr - (int)from_tnr < 0) {
-                       D_ASSERT(from_tnr - cnr + i - from == mx);
-                       from = i;
-                       from_tnr = cnr;
-               }
-               if ((int)cnr - (int)to_tnr > 0) {
-                       D_ASSERT(cnr - to_tnr == i - to);
-                       to = i;
-                       to_tnr = cnr;
-               }
-       }
-
-       if (!found_valid) {
-               if (found_initialized != mx)
-                       dev_warn(DEV, "No usable activity log found.\n");
-               drbd_md_put_buffer(mdev);
-               return 1;
-       }
-
-       /* Read the valid transactions.
-        * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
-       i = from;
-       while (1) {
-               struct lc_element *e;
-               unsigned j, n, slot, extent_nr;
-
-               rv = drbd_al_read_tr(mdev, bdev, i);
-               if (!expect(rv != 0))
-                       goto cancel;
-               if (rv == -1) {
-                       drbd_md_put_buffer(mdev);
-                       return 0;
-               }
-
-               /* deal with different transaction types.
-                * not yet implemented */
-               if (!expect(b->transaction_type == 0))
-                       goto cancel;
-
-               /* on the fly re-create/resize activity log?
-                * will be a special transaction type flag. */
-               if (!expect(be16_to_cpu(b->context_size) == mdev->act_log->nr_elements))
-                       goto cancel;
-               if (!expect(be16_to_cpu(b->context_start_slot_nr) < mdev->act_log->nr_elements))
-                       goto cancel;
-
-               /* We are the only user of the activity log right now,
-                * don't actually need to take that lock. */
-               spin_lock_irq(&mdev->al_lock);
-
-               /* first, apply the context, ... */
-               for (j = 0, slot = be16_to_cpu(b->context_start_slot_nr);
-                    j < AL_CONTEXT_PER_TRANSACTION &&
-                    slot < mdev->act_log->nr_elements; j++, slot++) {
-                       extent_nr = be32_to_cpu(b->context[j]);
-                       e = lc_element_by_index(mdev->act_log, slot);
-                       if (e->lc_number != extent_nr) {
-                               if (extent_nr != LC_FREE)
-                                       active_extents++;
-                               else
-                                       active_extents--;
-                       }
-                       lc_set(mdev->act_log, extent_nr, slot);
-               }
-
-               /* ... then apply the updates,
-                * which override the context information.
-                * drbd_al_read_tr already did the rangecheck
-                * on n <= AL_UPDATES_PER_TRANSACTION */
-               n = be16_to_cpu(b->n_updates);
-               for (j = 0; j < n; j++) {
-                       slot = be16_to_cpu(b->update_slot_nr[j]);
-                       extent_nr = be32_to_cpu(b->update_extent_nr[j]);
-                       if (!expect(slot < mdev->act_log->nr_elements))
-                               break;
-                       e = lc_element_by_index(mdev->act_log, slot);
-                       if (e->lc_number != extent_nr) {
-                               if (extent_nr != LC_FREE)
-                                       active_extents++;
-                               else
-                                       active_extents--;
-                       }
-                       lc_set(mdev->act_log, extent_nr, slot);
-               }
-               spin_unlock_irq(&mdev->al_lock);
-
-               transactions++;
-
-cancel:
-               if (i == to)
-                       break;
-               i++;
-               if (i >= mx)
-                       i = 0;
-       }
-
-       mdev->al_tr_number = to_tnr+1;
-       mdev->al_tr_pos = (to + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-
-       /* ok, we are done with it */
-       drbd_md_put_buffer(mdev);
-
-       dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
-            transactions, active_extents);
-
-       return 1;
-}
-
-/**
- * drbd_al_apply_to_bm() - Sets the bitmap to dirty(1) where covered by active AL extents
- * @mdev:      DRBD device.
- */
-void drbd_al_apply_to_bm(struct drbd_conf *mdev)
-{
-       unsigned int enr;
-       unsigned long add = 0;
-       char ppb[10];
-       int i, tmp;
-
-       wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
-       for (i = 0; i < mdev->act_log->nr_elements; i++) {
-               enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-               if (enr == LC_FREE)
-                       continue;
-               tmp = drbd_bm_ALe_set_all(mdev, enr);
-               dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
-               add += tmp;
-       }
-
-       lc_unlock(mdev->act_log);
-       wake_up(&mdev->al_wait);
-
-       dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
-            ppsize(ppb, Bit2KB(add)));
-}
-
 static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
 {
        int rv;
index 4e582058a7c99ad5056f6e7e1851bc879db2f52a..9d0d6d0fb8207eb753f67ee007de82af65e5aefd 100644 (file)
@@ -164,10 +164,6 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
 /* usual integer division */
 #define div_floor(A, B) ((A)/(B))
 
-/* drbd_meta-data.c (still in drbd_main.c) */
-/* 4th incarnation of the disk layout. */
-#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
-
 extern struct ratelimit_state drbd_ratelimit_state;
 extern struct idr minors; /* RCU, updates: genl_lock() */
 extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */
@@ -1560,7 +1556,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
 extern int drbd_rs_del_all(struct drbd_conf *mdev);
 extern void drbd_rs_failed_io(struct drbd_conf *mdev,
                sector_t sector, int size);
-extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
 extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
 extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
                int size, const char *file, const unsigned int line);
@@ -1570,7 +1565,6 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
                int size, const char *file, const unsigned int line);
 #define drbd_set_out_of_sync(mdev, sector, size) \
        __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
 extern void drbd_al_shrink(struct drbd_conf *mdev);
 
 /* drbd_nl.c */
index 15384986e4a477315bb4ecb42ac984797678fbe4..f1d696ab6e8399cffc55036dc5d07a39517e4774 100644 (file)
@@ -2932,7 +2932,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
        for (i = UI_CURRENT; i < UI_SIZE; i++)
                buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
        buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
-       buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+       buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
 
        buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
        buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
@@ -2967,11 +2967,12 @@ out:
  * @bdev:      Device from which the meta data should be read in.
  *
  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
- * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ * something goes wrong.
  */
 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 {
        struct meta_data_on_disk *buffer;
+       u32 magic, flags;
        int i, rv = NO_ERROR;
 
        if (!get_ldev_if_state(mdev, D_ATTACHING))
@@ -2989,8 +2990,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
                goto err;
        }
 
-       if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
-               dev_err(DEV, "Error while reading metadata, magic not found.\n");
+       magic = be32_to_cpu(buffer->magic);
+       flags = be32_to_cpu(buffer->flags);
+       if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+           (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+                       /* btw: that's Activity Log clean, not "all" clean. */
+               dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+               rv = ERR_MD_UNCLEAN;
+               goto err;
+       }
+       if (magic != DRBD_MD_MAGIC_08) {
+               if (magic == DRBD_MD_MAGIC_07) 
+                       dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+               else
+                       dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
                rv = ERR_MD_INVALID;
                goto err;
        }
@@ -3035,11 +3048,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
        }
        spin_unlock_irq(&mdev->tconn->req_lock);
 
-       /* This blocks wants to be get removed... */
-       bdev->disk_conf->al_extents = be32_to_cpu(buffer->al_nr_extents);
-       if (bdev->disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
-               bdev->disk_conf->al_extents = DRBD_AL_EXTENTS_DEF;
-
  err:
        drbd_md_put_buffer(mdev);
  out:
index bf8d0b077624eb038b51d85b796f052b5c14703a..b39f5dc0f47bfcdec0cc1d6ade87c52d468481b8 100644 (file)
@@ -1267,7 +1267,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        union drbd_state ns, os;
        enum drbd_state_rv rv;
        struct net_conf *nc;
-       int cp_discovered = 0;
 
        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
        if (!adm_ctx.reply_skb)
@@ -1477,11 +1476,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto force_diskless_dec;
        }
 
-       if (!drbd_al_read_log(mdev, nbc)) {
-               retcode = ERR_IO_MD_DISK;
-               goto force_diskless_dec;
-       }
-
        /* Reset the "barriers don't work" bits here, then force meta data to
         * be written, to ensure we determine if barriers are supported. */
        if (new_disk_conf->md_flushes)
@@ -1511,10 +1505,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                clear_bit(CRASHED_PRIMARY, &mdev->flags);
 
        if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
-           !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) {
+           !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
                set_bit(CRASHED_PRIMARY, &mdev->flags);
-               cp_discovered = 1;
-       }
 
        mdev->send_cnt = 0;
        mdev->recv_cnt = 0;
@@ -1566,15 +1558,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                }
        }
 
-       if (cp_discovered) {
-               drbd_al_apply_to_bm(mdev);
-               if (drbd_bitmap_io(mdev, &drbd_bm_write,
-                       "crashed primary apply AL", BM_LOCKED_MASK)) {
-                       retcode = ERR_IO_MD_DISK;
-                       goto force_diskless_dec;
-               }
-       }
-
        if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
                drbd_suspend_al(mdev); /* IO is still suspended here... */
 
index f51cefdbeff334c8df964d1f4d075a0db8aee5d0..c4d0d96d7906228b0bea0ed646445e64b3da5fd2 100644 (file)
@@ -1017,6 +1017,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
                                                 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
                                                 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
 
+               mdf &= ~MDF_AL_CLEAN;
                if (test_bit(CRASHED_PRIMARY, &mdev->flags))
                        mdf |= MDF_CRASHED_PRIMARY;
                if (mdev->state.role == R_PRIMARY ||
index 161cd414b03660db401a91ebb264cb60b231eac2..1e9f754b66acf0beba8a4466cade5751aed12f6f 100644 (file)
@@ -162,6 +162,7 @@ enum drbd_ret_code {
        ERR_INVALID_REQUEST     = 162,
        ERR_NEED_APV_100        = 163,
        ERR_NEED_ALLOW_TWO_PRI  = 164,
+       ERR_MD_UNCLEAN          = 165,
 
        /* insert new ones above this line */
        AFTER_LAST_ERR_CODE
@@ -321,7 +322,8 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv);
 #define MDF_FULL_SYNC          (1 << 3)
 #define MDF_WAS_UP_TO_DATE     (1 << 4)
 #define MDF_PEER_OUT_DATED     (1 << 5)
-#define MDF_CRASHED_PRIMARY     (1 << 6)
+#define MDF_CRASHED_PRIMARY    (1 << 6)
+#define MDF_AL_CLEAN           (1 << 7)
 
 enum drbd_uuid_index {
        UI_CURRENT,
@@ -341,10 +343,16 @@ enum drbd_timeout_flag {
 
 #define UUID_JUST_CREATED ((__u64)4)
 
+/* magic numbers used in meta data and network packets */
 #define DRBD_MAGIC 0x83740267
 #define DRBD_MAGIC_BIG 0x835a
 #define DRBD_MAGIC_100 0x8620ec20
 
+#define DRBD_MD_MAGIC_07   (DRBD_MAGIC+3)
+#define DRBD_MD_MAGIC_08   (DRBD_MAGIC+4)
+#define DRBD_MD_MAGIC_84_UNCLEAN       (DRBD_MAGIC+5)
+
+
 /* how I came up with this magic?
  * base64 decode "actlog==" ;) */
 #define DRBD_AL_MAGIC 0x69cb65a2