x86/mce: Factor out and deprecate the /dev/mcelog driver
authorTony Luck <tony.luck@intel.com>
Mon, 27 Mar 2017 09:33:03 +0000 (11:33 +0200)
committerIngo Molnar <mingo@kernel.org>
Tue, 28 Mar 2017 06:55:01 +0000 (08:55 +0200)
Move all code relating to /dev/mcelog to a separate source file.
/dev/mcelog driver can now operate from the machine check notifier with
lowest prio.

Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Move the mce_helper and trigger functionality behind CONFIG_X86_MCELOG_LEGACY. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170327093304.10683-6-bp@alien8.de
[ Renamed CONFIG_X86_MCELOG to CONFIG_X86_MCELOG_LEGACY. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/Kconfig
arch/x86/include/asm/mce.h
arch/x86/kernel/cpu/mcheck/Makefile
arch/x86/kernel/cpu/mcheck/dev-mcelog.c [new file with mode: 0644]
arch/x86/kernel/cpu/mcheck/mce-internal.h
arch/x86/kernel/cpu/mcheck/mce.c

index cc98d5a294eee25c56209d92e38bdb08f379780d..43e6bac6b9501873aab6c2dcb2e18790620d1dcb 100644 (file)
@@ -1043,6 +1043,14 @@ config X86_MCE
          The action the kernel takes depends on the severity of the problem,
          ranging from warning messages to halting the machine.
 
+config X86_MCELOG_LEGACY
+       bool "Support for deprecated /dev/mcelog character device"
+       depends on X86_MCE
+       ---help---
+         Enable support for /dev/mcelog which is needed by the old mcelog
+         userspace logging daemon. Consider switching to the new generation
+         rasdaemon solution.
+
 config X86_MCE_INTEL
        def_bool y
        prompt "Intel MCE features"
@@ -1072,7 +1080,7 @@ config X86_MCE_THRESHOLD
        def_bool y
 
 config X86_MCE_INJECT
-       depends on X86_MCE && X86_LOCAL_APIC
+       depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
        tristate "Machine check injector support"
        ---help---
          Provide support for injecting machine checks for testing purposes.
index c5ae545d27d85a1cf299f74cfd0731f2f59a2b02..4fd5195deed01836a092dccf680baf6b3e150713 100644 (file)
@@ -196,6 +196,7 @@ enum mce_notifier_prios {
        MCE_PRIO_EXTLOG         = INT_MAX - 2,
        MCE_PRIO_NFIT           = INT_MAX - 3,
        MCE_PRIO_EDAC           = INT_MAX - 4,
+       MCE_PRIO_MCELOG         = 1,
        MCE_PRIO_LOWEST         = 0,
 };
 
index a3311c88619463b005bcde39bd043d61dbf20954..43051f0777d44924e69d5d64eb121d9423d4fe91 100644 (file)
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT)    += mce-inject.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
 
 obj-$(CONFIG_ACPI_APEI)                += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY)        += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644 (file)
index 0000000..9c632cb
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+                        !lockdep_is_held(&mce_chrdev_read_mutex), \
+                        "suspicious mce_log_get_idx_check() usage"); \
+       smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+       .signature      = MCE_LOG_SIGNATURE,
+       .len            = MCE_LOG_LEN,
+       .recordlen      = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char                    mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+                               void *data)
+{
+       struct mce *mce = (struct mce *)data;
+       unsigned int next, entry;
+
+       wmb();
+       for (;;) {
+               entry = mce_log_get_idx_check(mcelog.next);
+               for (;;) {
+
+                       /*
+                        * When the buffer fills up discard new entries.
+                        * Assume that the earlier errors are the more
+                        * interesting ones:
+                        */
+                       if (entry >= MCE_LOG_LEN) {
+                               set_bit(MCE_OVERFLOW,
+                                       (unsigned long *)&mcelog.flags);
+                               return NOTIFY_OK;
+                       }
+                       /* Old left over entry. Skip: */
+                       if (mcelog.entry[entry].finished) {
+                               entry++;
+                               continue;
+                       }
+                       break;
+               }
+               smp_rmb();
+               next = entry + 1;
+               if (cmpxchg(&mcelog.next, entry, next) == entry)
+                       break;
+       }
+       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+       wmb();
+       mcelog.entry[entry].finished = 1;
+       wmb();
+
+       /* wake processes polling /dev/mcelog */
+       wake_up_interruptible(&mce_chrdev_wait);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+       .notifier_call  = dev_mce_log,
+       .priority       = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+       if (mce_helper[0])
+               schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+       strcpy(buf, mce_helper);
+       strcat(buf, "\n");
+       return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+                               const char *buf, size_t siz)
+{
+       char *p;
+
+       strncpy(mce_helper, buf, sizeof(mce_helper));
+       mce_helper[sizeof(mce_helper)-1] = 0;
+       p = strchr(mce_helper, '\n');
+
+       if (p)
+               *p = 0;
+
+       return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;      /* #times opened */
+static int mce_chrdev_open_exclu;      /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       if (mce_chrdev_open_exclu ||
+           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+               spin_unlock(&mce_chrdev_state_lock);
+
+               return -EBUSY;
+       }
+
+       if (file->f_flags & O_EXCL)
+               mce_chrdev_open_exclu = 1;
+       mce_chrdev_open_count++;
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       mce_chrdev_open_count--;
+       mce_chrdev_open_exclu = 0;
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return 0;
+}
+
+static void collect_tscs(void *data)
+{
+       unsigned long *cpu_tsc = (unsigned long *)data;
+
+       cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+       int rc;
+       u64 record_id;
+       struct mce m;
+
+       if (usize < sizeof(struct mce))
+               return -EINVAL;
+
+       rc = apei_read_mce(&m, &record_id);
+       /* Error or no more MCE record */
+       if (rc <= 0) {
+               mce_apei_read_done = 1;
+               /*
+                * When ERST is disabled, mce_chrdev_read() should return
+                * "no record" instead of "no device."
+                */
+               if (rc == -ENODEV)
+                       return 0;
+               return rc;
+       }
+       rc = -EFAULT;
+       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+               return rc;
+       /*
+        * In fact, we should have cleared the record after that has
+        * been flushed to the disk or sent to network in
+        * /sbin/mcelog, but we have no interface to support that now,
+        * so just clear it to avoid duplication.
+        */
+       rc = apei_clear_mce(record_id);
+       if (rc) {
+               mce_apei_read_done = 1;
+               return rc;
+       }
+       *ubuf += sizeof(struct mce);
+
+       return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+                               size_t usize, loff_t *off)
+{
+       char __user *buf = ubuf;
+       unsigned long *cpu_tsc;
+       unsigned prev, next;
+       int i, err;
+
+       cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+       if (!cpu_tsc)
+               return -ENOMEM;
+
+       mutex_lock(&mce_chrdev_read_mutex);
+
+       if (!mce_apei_read_done) {
+               err = __mce_read_apei(&buf, usize);
+               if (err || buf != ubuf)
+                       goto out;
+       }
+
+       next = mce_log_get_idx_check(mcelog.next);
+
+       /* Only supports full reads right now */
+       err = -EINVAL;
+       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+               goto out;
+
+       err = 0;
+       prev = 0;
+       do {
+               for (i = prev; i < next; i++) {
+                       unsigned long start = jiffies;
+                       struct mce *m = &mcelog.entry[i];
+
+                       while (!m->finished) {
+                               if (time_after_eq(jiffies, start + 2)) {
+                                       memset(m, 0, sizeof(*m));
+                                       goto timeout;
+                               }
+                               cpu_relax();
+                       }
+                       smp_rmb();
+                       err |= copy_to_user(buf, m, sizeof(*m));
+                       buf += sizeof(*m);
+timeout:
+                       ;
+               }
+
+               memset(mcelog.entry + prev, 0,
+                      (next - prev) * sizeof(struct mce));
+               prev = next;
+               next = cmpxchg(&mcelog.next, prev, 0);
+       } while (next != prev);
+
+       synchronize_sched();
+
+       /*
+        * Collect entries that were still getting written before the
+        * synchronize.
+        */
+       on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+       for (i = next; i < MCE_LOG_LEN; i++) {
+               struct mce *m = &mcelog.entry[i];
+
+               if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+                       err |= copy_to_user(buf, m, sizeof(*m));
+                       smp_rmb();
+                       buf += sizeof(*m);
+                       memset(m, 0, sizeof(*m));
+               }
+       }
+
+       if (err)
+               err = -EFAULT;
+
+out:
+       mutex_unlock(&mce_chrdev_read_mutex);
+       kfree(cpu_tsc);
+
+       return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+       poll_wait(file, &mce_chrdev_wait, wait);
+       if (READ_ONCE(mcelog.next))
+               return POLLIN | POLLRDNORM;
+       if (!mce_apei_read_done && apei_check_mce())
+               return POLLIN | POLLRDNORM;
+       return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+                               unsigned long arg)
+{
+       int __user *p = (int __user *)arg;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (cmd) {
+       case MCE_GET_RECORD_LEN:
+               return put_user(sizeof(struct mce), p);
+       case MCE_GET_LOG_LEN:
+               return put_user(MCE_LOG_LEN, p);
+       case MCE_GETCLEAR_FLAGS: {
+               unsigned flags;
+
+               do {
+                       flags = mcelog.flags;
+               } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+               return put_user(flags, p);
+       }
+       default:
+               return -ENOTTY;
+       }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+                           size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+                            const char __user *ubuf,
+                            size_t usize, loff_t *off))
+{
+       mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+                               size_t usize, loff_t *off)
+{
+       if (mce_write)
+               return mce_write(filp, ubuf, usize, off);
+       else
+               return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+       .open                   = mce_chrdev_open,
+       .release                = mce_chrdev_release,
+       .read                   = mce_chrdev_read,
+       .write                  = mce_chrdev_write,
+       .poll                   = mce_chrdev_poll,
+       .unlocked_ioctl         = mce_chrdev_ioctl,
+       .llseek                 = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+       MISC_MCELOG_MINOR,
+       "mcelog",
+       &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+       int err;
+
+       /* register character device /dev/mcelog */
+       err = misc_register(&mce_chrdev_device);
+       if (err) {
+               pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+               return err;
+       }
+       mce_register_decode_chain(&dev_mcelog_nb);
+       return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
index 903043e6a62b36a2c395c7aab52da6f85e34fc11..7f2a7c54391f45e11e00c9f40fc86ef30288a5f1 100644 (file)
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
                m1->addr != m2->addr ||
                m1->misc != m2->misc;
 }
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void)      { }
+#endif
index 4a907758a51658e7e14949d764e1535c52d57f49..36082c7fe4a0f10a3161984b9fc3a24e6be96eba 100644 (file)
 
 #include "mce-internal.h"
 
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static int mce_chrdev_open_count;      /* #times opened */
-
-#define mce_log_get_idx_check(p) \
-({ \
-       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
-                        !lockdep_is_held(&mce_chrdev_read_mutex), \
-                        "suspicious mce_log_get_idx_check() usage"); \
-       smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -89,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
        .monarch_timeout = -1
 };
 
-/* User mode helper program triggered by machine check event */
-static unsigned long           mce_need_notify;
-static char                    mce_helper[128];
-static char                    *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
 static DEFINE_PER_CPU(struct mce, mces_seen);
-static int                     cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
 
 /*
  * MCA banks polled by the period polling timer for corrected events.
@@ -147,18 +131,6 @@ void mce_setup(struct mce *m)
 DEFINE_PER_CPU(struct mce, injectm);
 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log_buffer mcelog_buf = {
-       .signature      = MCE_LOG_SIGNATURE,
-       .len            = MCE_LOG_LEN,
-       .recordlen      = sizeof(struct mce),
-};
-
 void mce_log(struct mce *m)
 {
        if (!mce_gen_pool_add(m))
@@ -167,9 +139,9 @@ void mce_log(struct mce *m)
 
 void mce_inject_log(struct mce *m)
 {
-       mutex_lock(&mce_chrdev_read_mutex);
+       mutex_lock(&mce_log_mutex);
        mce_log(m);
-       mutex_unlock(&mce_chrdev_read_mutex);
+       mutex_unlock(&mce_log_mutex);
 }
 EXPORT_SYMBOL_GPL(mce_inject_log);
 
@@ -582,7 +554,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
                              void *data)
 {
        struct mce *m = (struct mce *)data;
-       unsigned int next, entry;
 
        if (!m)
                return NOTIFY_DONE;
@@ -593,38 +564,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
        /* Emit the trace record: */
        trace_mce_record(m);
 
-       wmb();
-       for (;;) {
-               entry = mce_log_get_idx_check(mcelog_buf.next);
-               for (;;) {
-
-                       /*
-                        * When the buffer fills up discard new entries.
-                        * Assume that the earlier errors are the more
-                        * interesting ones:
-                        */
-                       if (entry >= MCE_LOG_LEN) {
-                               set_bit(MCE_OVERFLOW,
-                                       (unsigned long *)&mcelog_buf.flags);
-                               return NOTIFY_DONE;
-                       }
-                       /* Old left over entry. Skip: */
-                       if (mcelog_buf.entry[entry].finished) {
-                               entry++;
-                               continue;
-                       }
-                       break;
-               }
-               smp_rmb();
-               next = entry + 1;
-               if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
-                       break;
-       }
-       memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
-       wmb();
-       mcelog_buf.entry[entry].finished = 1;
-       wmb();
-
        set_bit(0, &mce_need_notify);
 
        mce_notify_irq();
@@ -669,10 +608,6 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
        if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
                return NOTIFY_DONE;
 
-       /* Don't print when mcelog is running */
-       if (mce_chrdev_open_count > 0)
-               return NOTIFY_DONE;
-
        __print_mce(m);
 
        return NOTIFY_DONE;
@@ -1456,13 +1391,6 @@ static void mce_timer_delete_all(void)
                del_timer_sync(&per_cpu(mce_timer, cpu));
 }
 
-static void mce_do_trigger(struct work_struct *work)
-{
-       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
 /*
  * Notify the user(s) about new machine check events.
  * Can be called from interrupt context, but not from machine check/NMI
@@ -1474,11 +1402,7 @@ int mce_notify_irq(void)
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
        if (test_and_clear_bit(0, &mce_need_notify)) {
-               /* wake processes polling /dev/mcelog */
-               wake_up_interruptible(&mce_chrdev_wait);
-
-               if (mce_helper[0])
-                       schedule_work(&mce_trigger_work);
+               mce_work_trigger();
 
                if (__ratelimit(&ratelimit))
                        pr_info(HW_ERR "Machine check events logged\n");
@@ -1886,251 +1810,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
 
 }
 
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_exclu;      /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       if (mce_chrdev_open_exclu ||
-           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
-               spin_unlock(&mce_chrdev_state_lock);
-
-               return -EBUSY;
-       }
-
-       if (file->f_flags & O_EXCL)
-               mce_chrdev_open_exclu = 1;
-       mce_chrdev_open_count++;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       mce_chrdev_open_count--;
-       mce_chrdev_open_exclu = 0;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return 0;
-}
-
-static void collect_tscs(void *data)
-{
-       unsigned long *cpu_tsc = (unsigned long *)data;
-
-       cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
-       int rc;
-       u64 record_id;
-       struct mce m;
-
-       if (usize < sizeof(struct mce))
-               return -EINVAL;
-
-       rc = apei_read_mce(&m, &record_id);
-       /* Error or no more MCE record */
-       if (rc <= 0) {
-               mce_apei_read_done = 1;
-               /*
-                * When ERST is disabled, mce_chrdev_read() should return
-                * "no record" instead of "no device."
-                */
-               if (rc == -ENODEV)
-                       return 0;
-               return rc;
-       }
-       rc = -EFAULT;
-       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
-               return rc;
-       /*
-        * In fact, we should have cleared the record after that has
-        * been flushed to the disk or sent to network in
-        * /sbin/mcelog, but we have no interface to support that now,
-        * so just clear it to avoid duplication.
-        */
-       rc = apei_clear_mce(record_id);
-       if (rc) {
-               mce_apei_read_done = 1;
-               return rc;
-       }
-       *ubuf += sizeof(struct mce);
-
-       return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       char __user *buf = ubuf;
-       unsigned long *cpu_tsc;
-       unsigned prev, next;
-       int i, err;
-
-       cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
-       if (!cpu_tsc)
-               return -ENOMEM;
-
-       mutex_lock(&mce_chrdev_read_mutex);
-
-       if (!mce_apei_read_done) {
-               err = __mce_read_apei(&buf, usize);
-               if (err || buf != ubuf)
-                       goto out;
-       }
-
-       next = mce_log_get_idx_check(mcelog_buf.next);
-
-       /* Only supports full reads right now */
-       err = -EINVAL;
-       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
-               goto out;
-
-       err = 0;
-       prev = 0;
-       do {
-               for (i = prev; i < next; i++) {
-                       unsigned long start = jiffies;
-                       struct mce *m = &mcelog_buf.entry[i];
-
-                       while (!m->finished) {
-                               if (time_after_eq(jiffies, start + 2)) {
-                                       memset(m, 0, sizeof(*m));
-                                       goto timeout;
-                               }
-                               cpu_relax();
-                       }
-                       smp_rmb();
-                       err |= copy_to_user(buf, m, sizeof(*m));
-                       buf += sizeof(*m);
-timeout:
-                       ;
-               }
-
-               memset(mcelog_buf.entry + prev, 0,
-                      (next - prev) * sizeof(struct mce));
-               prev = next;
-               next = cmpxchg(&mcelog_buf.next, prev, 0);
-       } while (next != prev);
-
-       synchronize_sched();
-
-       /*
-        * Collect entries that were still getting written before the
-        * synchronize.
-        */
-       on_each_cpu(collect_tscs, cpu_tsc, 1);
-
-       for (i = next; i < MCE_LOG_LEN; i++) {
-               struct mce *m = &mcelog_buf.entry[i];
-
-               if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
-                       err |= copy_to_user(buf, m, sizeof(*m));
-                       smp_rmb();
-                       buf += sizeof(*m);
-                       memset(m, 0, sizeof(*m));
-               }
-       }
-
-       if (err)
-               err = -EFAULT;
-
-out:
-       mutex_unlock(&mce_chrdev_read_mutex);
-       kfree(cpu_tsc);
-
-       return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
-       poll_wait(file, &mce_chrdev_wait, wait);
-       if (READ_ONCE(mcelog_buf.next))
-               return POLLIN | POLLRDNORM;
-       if (!mce_apei_read_done && apei_check_mce())
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
-                               unsigned long arg)
-{
-       int __user *p = (int __user *)arg;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       switch (cmd) {
-       case MCE_GET_RECORD_LEN:
-               return put_user(sizeof(struct mce), p);
-       case MCE_GET_LOG_LEN:
-               return put_user(MCE_LOG_LEN, p);
-       case MCE_GETCLEAR_FLAGS: {
-               unsigned flags;
-
-               do {
-                       flags = mcelog_buf.flags;
-               } while (cmpxchg(&mcelog_buf.flags, flags, 0) != flags);
-
-               return put_user(flags, p);
-       }
-       default:
-               return -ENOTTY;
-       }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
-                           size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
-                            const char __user *ubuf,
-                            size_t usize, loff_t *off))
-{
-       mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       if (mce_write)
-               return mce_write(filp, ubuf, usize, off);
-       else
-               return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
-       .open                   = mce_chrdev_open,
-       .release                = mce_chrdev_release,
-       .read                   = mce_chrdev_read,
-       .write                  = mce_chrdev_write,
-       .poll                   = mce_chrdev_poll,
-       .unlocked_ioctl         = mce_chrdev_ioctl,
-       .llseek                 = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
-       MISC_MCELOG_MINOR,
-       "mcelog",
-       &mce_chrdev_ops,
-};
-
 static void __mce_disable_bank(void *arg)
 {
        int bank = *((int *)arg);
@@ -2349,29 +2028,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
        return size;
 }
 
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
-       strcpy(buf, mce_helper);
-       strcat(buf, "\n");
-       return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
-                               const char *buf, size_t siz)
-{
-       char *p;
-
-       strncpy(mce_helper, buf, sizeof(mce_helper));
-       mce_helper[sizeof(mce_helper)-1] = 0;
-       p = strchr(mce_helper, '\n');
-
-       if (p)
-               *p = 0;
-
-       return strlen(mce_helper) + !!p;
-}
-
 static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
@@ -2428,7 +2084,6 @@ static ssize_t store_int_with_restart(struct device *s,
        return ret;
 }
 
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2451,7 +2106,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
 static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
        &dev_attr_trigger,
+#endif
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
@@ -2625,7 +2282,6 @@ static __init void mce_init_banks(void)
 
 static __init int mcheck_init_device(void)
 {
-       enum cpuhp_state hp_online;
        int err;
 
        if (!mce_available(&boot_cpu_data)) {
@@ -2653,21 +2309,11 @@ static __init int mcheck_init_device(void)
                                mce_cpu_online, mce_cpu_pre_down);
        if (err < 0)
                goto err_out_online;
-       hp_online = err;
 
        register_syscore_ops(&mce_syscore_ops);
 
-       /* register character device /dev/mcelog */
-       err = misc_register(&mce_chrdev_device);
-       if (err)
-               goto err_register;
-
        return 0;
 
-err_register:
-       unregister_syscore_ops(&mce_syscore_ops);
-       cpuhp_remove_state(hp_online);
-
 err_out_online:
        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
 
@@ -2675,7 +2321,7 @@ err_out_mem:
        free_cpumask_var(mce_device_initialized);
 
 err_out:
-       pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+       pr_err("Unable to init MCE device (rc: %d)\n", err);
 
        return err;
 }