static DEFINE_MUTEX(watchdog_mutex);
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) \
+ || defined(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU)
# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
# define NMI_WATCHDOG_DEFAULT 1
#else
int __read_mostly soft_watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
int __read_mostly nmi_watchdog_available;
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU)
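+/* non-zero when the cross-CPU hard-lockup detector may be used */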
+int __read_mostly watchdog_other_cpu_available = WATCHDOG_DEFAULT;
+#endif
struct cpumask watchdog_allowed_mask __read_mostly;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU)
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
* softlockup watchdog threads start and stop. The arch must select the
* SOFTLOCKUP_DETECTOR Kconfig.
*/
+
+#ifndef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
int __weak watchdog_nmi_enable(unsigned int cpu)
{
 hardlockup_detector_perf_enable();
 return 0;
}

void __weak watchdog_nmi_disable(unsigned int cpu)
{
 hardlockup_detector_perf_disable();
}
+#endif
/* Return 0 if an NMI watchdog is available, error code otherwise */
int __weak __init watchdog_nmi_probe(void)
watchdog_enabled = 0;
if (!watchdog_user_enabled)
return;
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU)
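+ /* the cross-CPU detector reuses the NMI_WATCHDOG_ENABLED bit */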
+ if (watchdog_other_cpu_available && nmi_watchdog_user_enabled)
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+#endif
if (nmi_watchdog_available && nmi_watchdog_user_enabled)
watchdog_enabled |= NMI_WATCHDOG_ENABLED;
if (soft_watchdog_user_enabled)
static bool softlockup_threads_initialized __read_mostly;
static u64 __read_mostly sample_period;
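+/* cross-CPU hard-lockup threshold in seconds, computed in set_sample_period() */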
+static unsigned long __read_mostly hardlockup_thresh;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
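+/* per-CPU timestamp (seconds) of the last hard-lockup watchdog touch */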
+static DEFINE_PER_CPU(unsigned long, hardlockup_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
watchdog_update_hrtimer_threshold(sample_period);
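+ /* three sample periods, converted from ns to seconds: ~1.2 * watchdog_thresh */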
+ hardlockup_thresh = sample_period * 3 / NSEC_PER_SEC;
}
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
__this_cpu_write(watchdog_touch_ts, get_timestamp());
+ __this_cpu_write(hardlockup_touch_ts, get_timestamp());
}
/**
* entering idle state. This should only be used for scheduler events.
* Use touch_softlockup_watchdog() for everything else.
*/
-void touch_softlockup_watchdog_sched(void)
+notrace void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
raw_cpu_write(watchdog_touch_ts, 0);
}
-void touch_softlockup_watchdog(void)
+notrace void touch_softlockup_watchdog(void)
{
touch_softlockup_watchdog_sched();
wq_watchdog_touch(raw_smp_processor_id());
__this_cpu_write(watchdog_touch_ts, 0);
}
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static void watchdog_check_hardlockup_other_cpu(void);
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ /* try to re-enable exynos-snapshot's log_kevent in case an RCU stall turned it off */
+ dbg_snapshot_try_enable("log_kevent", NSEC_PER_SEC * 15);
if (!watchdog_enabled)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
nmi_watchdog_available = true;
lockup_detector_setup();
}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
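+/*
+ * Cross-CPU hard-lockup detector: each CPU in watchdog_cpus checks whether
+ * the next CPU's hrtimer interrupt count is still advancing.
+ */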
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static cpumask_t __read_mostly watchdog_cpus;
+ATOMIC_NOTIFIER_HEAD(hardlockup_notifier_list);
+EXPORT_SYMBOL(hardlockup_notifier_list);
+
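+/* return the next cpu in watchdog_cpus, wrapping; nr_cpu_ids if cpu is the only one */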
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
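+/*
+ * A CPU is considered hard-locked when its hrtimer interrupt count has not
+ * advanced since the last check and its hardlockup_touch_ts is at least
+ * hardlockup_thresh seconds old.
+ */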
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) {
+ unsigned long now = get_timestamp();
+ unsigned long touch_ts = per_cpu(hardlockup_touch_ts, cpu);
+
+ if (time_after(now, touch_ts) &&
+ (now - touch_ts >= hardlockup_thresh))
+ return 1;
+ }
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
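+ /* pairs with smp_wmb() in watchdog_nmi_enable()/disable() */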
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic) {
+ dbg_snapshot_set_hardlockup(hardlockup_panic);
+ atomic_notifier_call_chain(&hardlockup_notifier_list, 0, (void *)&next_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ } else {
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ }
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ arch_touch_nmi_watchdog();
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#endif