x86/ras/therm_throt: Do not log a fake MCE for thermal events
authorBorislav Petkov <bp@suse.de>
Mon, 23 Jan 2017 18:35:07 +0000 (19:35 +0100)
committerIngo Molnar <mingo@kernel.org>
Tue, 24 Jan 2017 08:14:53 +0000 (09:14 +0100)
We log a fake bank 128 MCE to note that we're handling a CPU thermal
event. However, this confuses people into thinking that their hardware
generates MCEs. Hijacking MCA for logging thermal events is a gross
misuse anyway and it shouldn't have been done in the first place. And
besides we have other means for dealing with thermal events which are
much more suitable.

So let's kill the MCE logging part.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Ashok Raj <ashok.raj@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170105213846.GA12024@gmail.com
Link: http://lkml.kernel.org/r/20170123183514.13356-3-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/include/asm/mce.h
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/therm_throt.c

index 5132f2a6c0a2424016eeebd6fb717da18bbd76cc..a09ed05725c213a3d486092269f26953dbfa1b61 100644 (file)
 
 #define MCE_OVERFLOW 0         /* bit 0 in flags means overflow */
 
-/* Software defined banks */
-#define MCE_EXTENDED_BANK      128
-#define MCE_THERMAL_BANK       (MCE_EXTENDED_BANK + 0)
-
 #define MCE_LOG_LEN 32
 #define MCE_LOG_SIGNATURE      "MACHINECHECK"
 
@@ -306,8 +302,6 @@ extern void (*deferred_error_int_vector)(void);
 
 void intel_init_thermal(struct cpuinfo_x86 *c);
 
-void mce_log_therm_throt_event(__u64 status);
-
 /* Interrupt Handler for core thermal thresholds */
 extern int (*platform_thermal_notify)(__u64 msr_val);
 
index 00ef43233e034b0cde9b2adc88b8003ddc42d00b..6eef6fde0f024572440b82c3cb13c92a4db8ef38 100644 (file)
@@ -1331,31 +1331,6 @@ static void mce_process_work(struct work_struct *dummy)
        mce_gen_pool_process();
 }
 
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
-       struct mce m;
-
-       mce_setup(&m);
-       m.bank = MCE_THERMAL_BANK;
-       m.status = status;
-       mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
 /*
  * Periodic polling timer for "silent" machine check errors.  If the
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
index 465aca8be009ff21b9863191a5b248d39ec642c7..85469f84c9214027aab98273ac952a94291173b1 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Maintains a counter in /sys that keeps track of the number of thermal
  * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
+ * (since the logging to syslog is rate limited).
  *
  * Author: Dmitriy Zavin (dmitriyz@google.com)
  *
@@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = {
  * IRQ has been acknowledged.
  *
  * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- *              "timeout" from previous log message.
- *          1 : Event should be logged further, and a message has been
- *              printed to the syslog.
  */
-static int therm_throt_process(bool new_event, int event, int level)
+static void therm_throt_process(bool new_event, int event, int level)
 {
        struct _thermal_state *state;
        unsigned int this_cpu = smp_processor_id();
@@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level)
                else if (event == POWER_LIMIT_EVENT)
                        state = &pstate->core_power_limit;
                else
-                        return 0;
+                       return;
        } else if (level == PACKAGE_LEVEL) {
                if (event == THERMAL_THROTTLING_EVENT)
                        state = &pstate->package_throttle;
                else if (event == POWER_LIMIT_EVENT)
                        state = &pstate->package_power_limit;
                else
-                       return 0;
+                       return;
        } else
-               return 0;
+               return;
 
        old_event = state->new_event;
        state->new_event = new_event;
@@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level)
 
        if (time_before64(now, state->next_check) &&
                        state->count != state->last_count)
-               return 0;
+               return;
 
        state->next_check = now + CHECK_INTERVAL;
        state->last_count = state->count;
@@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level)
                                this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package",
                                state->count);
-               return 1;
+               return;
        }
        if (old_event) {
                if (event == THERMAL_THROTTLING_EVENT)
                        pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package");
-               return 1;
+               return;
        }
-
-       return 0;
 }
 
 static int thresh_event_valid(int level, int event)
@@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void)
        /* Check for violation of core thermal thresholds*/
        notify_thresholds(msr_val);
 
-       if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
-                               THERMAL_THROTTLING_EVENT,
-                               CORE_LEVEL) != 0)
-               mce_log_therm_throt_event(msr_val);
+       therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+                           THERMAL_THROTTLING_EVENT,
+                           CORE_LEVEL);
 
        if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
                therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,