#else
#define xlog_recover_check_summary(log)
#endif
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
* This structure is used during recovery to record the buf log items which
return error;
}
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk forward from the tail block until we hit the head or the last
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+ for (i = (int) tail_blk; i <= end_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the head block or the log record header count,
+ * start looking again from the start of the physical log.
+ */
+ if (tail_blk > head_blk && found != count) {
+ for (i = 0; i < (int) head_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous record.
+ * The tail of the previous record must now be verified to ensure the torn
+ * writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ struct xlog_rec_header *thead;
+ struct xfs_buf *bp;
+ xfs_daddr_t first_bad;
+ int count;
+ int error = 0;
+ bool wrapped;
+ xfs_daddr_t tmp_head;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+
+ /*
+ * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+ * a temporary head block that points after the last possible
+ * concurrently written record of the tail.
+ */
+ count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+ XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+ &wrapped);
+ if (count < 0) {
+ error = count;
+ goto out;
+ }
+
+ /*
+ * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+ * into the actual log head. tmp_head points to the start of the record
+ * so update it to the actual head block.
+ */
+ if (count < XLOG_MAX_ICLOGS + 1)
+ tmp_head = head_blk;
+
+ /*
+ * We now have a tail and temporary head block that covers at least
+ * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+ * records were completely written. Run a CRC verification pass from
+ * tail to head and return the result.
+ */
+ error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+ struct xlog *log,
+ xfs_daddr_t *head_blk, /* in/out: unverified head */
+ xfs_daddr_t *tail_blk, /* out: tail block */
+ struct xfs_buf *bp,
+ xfs_daddr_t *rhead_blk, /* start blk of last record */
+ struct xlog_rec_header **rhead, /* ptr to last record */
+ bool *wrapped) /* last rec. wraps phys. log */
+{
+ struct xlog_rec_header *tmp_rhead;
+ struct xfs_buf *tmp_bp;
+ xfs_daddr_t first_bad;
+ xfs_daddr_t tmp_rhead_blk;
+ int found;
+ int error;
+ bool tmp_wrapped;
+
+ /*
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
+ */
+ found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
+ rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (!found) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+
+ /*
+ * Now that we have a tail block, check the head of the log for torn
+ * writes. Search again until we hit the tail or the maximum number of
+ * log record I/Os that could have been in flight at one time. Use a
+ * temporary buffer so we don't trash the rhead/bp pointer from the
+ * call above.
+ */
+ tmp_bp = xlog_get_bp(log, 1);
+ if (!tmp_bp)
+ return -ENOMEM;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+ XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+ &tmp_rhead, &tmp_wrapped);
+ xlog_put_bp(tmp_bp);
+ if (error < 0)
+ return error;
+
+ /*
+ * Now run a CRC verification pass over the records starting at the
+ * block found above to the current head. If a CRC failure occurs, the
+ * log block of the first bad record is saved in first_bad.
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ if (error == -EFSBADCRC) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+ */
+ error = 0;
+ xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+ first_bad, *head_blk);
+
+ /*
+ * Get the header block and buffer pointer for the last good
+ * record before the bad record.
+ *
+ * Note that xlog_find_tail() clears the blocks at the new head
+ * (i.e., the records with invalid CRC) if the cycle number
+ * matches the the current cycle.
+ */
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
+ rhead_blk, rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (found == 0) /* XXX: right thing to do here? */
+ return -EIO;
+
+ /*
+ * Reset the head block to the starting block of the first bad
+ * log record and set the tail block based on the last good
+ * record.
+ *
+ * Bail out if the updated head/tail match as this indicates
+ * possible corruption outside of the acceptable
+ * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+ */
+ *head_blk = first_bad;
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+ if (*head_blk == *tail_blk) {
+ ASSERT(0);
+ return 0;
+ }
+
+ /*
+ * Now verify the tail based on the updated head. This is
+ * required because the torn writes trimmed from the head could
+ * have been written over the tail of a previous record. Return
+ * any errors since recovery cannot proceed if the tail is
+ * corrupt.
+ *
+ * XXX: This leaves a gap in truly robust protection from torn
+ * writes in the log. If the head is behind the tail, the tail
+ * pushes forward to create some space and then a crash occurs
+ * causing the writes into the previous record's tail region to
+ * tear, log recovery isn't able to recover.
+ *
+ * How likely is this to occur? If possible, can we do something
+ * more intelligent here? Is it safe to push the tail forward if
+ * we can determine that the tail is within the range of the
+ * torn write (e.g., the kernel can only overwrite the tail if
+ * it has actually been pushed forward)? Alternatively, could we
+ * somehow prevent this condition at runtime?
+ */
+ error = xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+
+ return error;
+}
+
/*
* Find the sync block number or the tail of the log.
*
xlog_op_header_t *op_head;
char *offset = NULL;
xfs_buf_t *bp;
- int error, i, found;
+ int error;
xfs_daddr_t umount_data_blk;
xfs_daddr_t after_umount_blk;
+ xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
int hblks;
bool wrapped = false;
}
/*
- * Search backwards through the log looking for the log record header
- * block. This wraps all the way back around to the head so something is
- * seriously wrong if we can't find it.
+ * Trim the head block back to skip over torn records. We can have
+ * multiple log I/Os in flight at any time, so we assume CRC failures
+ * back through the previous several records are torn writes and skip
+ * them.
*/
ASSERT(*head_blk < INT_MAX);
- found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, &i,
- &rhead, &wrapped);
- if (found < 0) {
- error = found;
+ error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
+ &rhead, &wrapped);
+ if (error)
goto done;
- }
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- xlog_put_bp(bp);
- ASSERT(0);
- return -EIO;
- }
- *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
/*
* Reset log values according to the state of the log when we
* written was complete and ended exactly on the end boundary
* of the physical log.
*/
- log->l_prev_block = i;
+ log->l_prev_block = rhead_blk;
log->l_curr_block = (int)*head_blk;
log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
if (wrapped)
} else {
hblks = 1;
}
- after_umount_blk = (i + hblks + (int)
- BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
tail_lsn = atomic64_read(&log->l_tail_lsn);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = (i + hblks) % log->l_logBBsize;
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
if (error)
goto done;