time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
author John Stultz <john.stultz@linaro.org>
Thu, 11 Jun 2015 22:54:55 +0000 (15:54 -0700)
committer Thomas Gleixner <tglx@linutronix.de>
Fri, 12 Jun 2015 09:15:49 +0000 (11:15 +0200)
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.

This was in part historical from back when we were always tick based,
but correcting this has since been avoided because it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.

However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.

This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd-made leapsecond adjustments done w/o using
the kernel discipline, where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)

However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.

So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the ktime_get_update_offsets_now() accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.

This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.

Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.

While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.

Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/time64.h
include/linux/timekeeper_internal.h
kernel/time/ntp.c
kernel/time/ntp_internal.h
kernel/time/timekeeping.c

index 12d4e82b02765efa27b5ecf20779a1bd6f7d4f2c..77b5df2acd2adde021954ae8275b9f39d19c7c0f 100644 (file)
@@ -29,6 +29,7 @@ struct timespec64 {
 #define FSEC_PER_SEC   1000000000000000LL
 
 /* Located here for timespec[64]_valid_strict */
+#define TIME64_MAX                     ((s64)~((u64)1 << 63))
 #define KTIME_MAX                      ((s64)~((u64)1 << 63))
 #define KTIME_SEC_MAX                  (KTIME_MAX / NSEC_PER_SEC)
 
index e1f5a11365546a391f5dbb43d01c831fed9a5390..25247220b4b7ddf3f336077b105ab42271475a05 100644 (file)
@@ -50,6 +50,7 @@ struct tk_read_base {
  * @offs_tai:          Offset clock monotonic -> clock tai
  * @tai_offset:                The current UTC to TAI offset in seconds
  * @clock_was_set_seq: The sequence number of clock was set events
+ * @next_leap_ktime:   CLOCK_MONOTONIC time value of a pending leap-second
  * @raw_time:          Monotonic raw base time in timespec64 format
  * @cycle_interval:    Number of clock cycles in one NTP interval
  * @xtime_interval:    Number of clock shifted nano seconds in one NTP
@@ -90,6 +91,7 @@ struct timekeeper {
        ktime_t                 offs_tai;
        s32                     tai_offset;
        unsigned int            clock_was_set_seq;
+       ktime_t                 next_leap_ktime;
        struct timespec64       raw_time;
 
        /* The following members are for timekeeping internal use */
index 7aa216188450ed71b77c78c1be28916c14cdb7fe..033743e3647a6d626229451eb76aee1aa22edeec 100644 (file)
@@ -77,6 +77,9 @@ static long                   time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)   */
 static s64                     ntp_tick_adj;
 
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t                        ntp_next_leap_sec = TIME64_MAX;
+
 #ifdef CONFIG_NTP_PPS
 
 /*
@@ -350,6 +353,7 @@ void ntp_clear(void)
        tick_length     = tick_length_base;
        time_offset     = 0;
 
+       ntp_next_leap_sec = TIME64_MAX;
        /* Clear PPS state variables */
        pps_clear();
 }
@@ -360,6 +364,21 @@ u64 ntp_tick_length(void)
        return tick_length;
 }
 
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ *
+ * Only a pending insertion is reported: the leap time is returned solely
+ * when time_state is TIME_INS and STA_INS is still set in time_status.
+ * In every other state, including a pending deletion (TIME_DEL), the
+ * result is KTIME_MAX.
+ *
+ * Return: a ktime_t built from ntp_next_leap_sec with a zero nanosecond
+ * part, or a ktime_t whose tv64 field is KTIME_MAX.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+       ktime_t ret;
+
+       if ((time_state == TIME_INS) && (time_status & STA_INS))
+               return ktime_set(ntp_next_leap_sec, 0);
+       ret.tv64 = KTIME_MAX;
+       return ret;
+}
 
 /*
  * this routine handles the overflow of the microsecond field
@@ -383,15 +402,21 @@ int second_overflow(unsigned long secs)
         */
        switch (time_state) {
        case TIME_OK:
-               if (time_status & STA_INS)
+               if (time_status & STA_INS) {
                        time_state = TIME_INS;
-               else if (time_status & STA_DEL)
+                       ntp_next_leap_sec = secs + SECS_PER_DAY -
+                                               (secs % SECS_PER_DAY);
+               } else if (time_status & STA_DEL) {
                        time_state = TIME_DEL;
+                       ntp_next_leap_sec = secs + SECS_PER_DAY -
+                                                ((secs+1) % SECS_PER_DAY);
+               }
                break;
        case TIME_INS:
-               if (!(time_status & STA_INS))
+               if (!(time_status & STA_INS)) {
+                       ntp_next_leap_sec = TIME64_MAX;
                        time_state = TIME_OK;
-               else if (secs % SECS_PER_DAY == 0) {
+               } else if (secs % SECS_PER_DAY == 0) {
                        leap = -1;
                        time_state = TIME_OOP;
                        printk(KERN_NOTICE
@@ -399,19 +424,21 @@ int second_overflow(unsigned long secs)
                }
                break;
        case TIME_DEL:
-               if (!(time_status & STA_DEL))
+               if (!(time_status & STA_DEL)) {
+                       ntp_next_leap_sec = TIME64_MAX;
                        time_state = TIME_OK;
-               else if ((secs + 1) % SECS_PER_DAY == 0) {
+               } else if ((secs + 1) % SECS_PER_DAY == 0) {
                        leap = 1;
+                       ntp_next_leap_sec = TIME64_MAX;
                        time_state = TIME_WAIT;
                        printk(KERN_NOTICE
                                "Clock: deleting leap second 23:59:59 UTC\n");
                }
                break;
        case TIME_OOP:
+               ntp_next_leap_sec = TIME64_MAX;
                time_state = TIME_WAIT;
                break;
-
        case TIME_WAIT:
                if (!(time_status & (STA_INS | STA_DEL)))
                        time_state = TIME_OK;
@@ -548,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
        if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
                time_state = TIME_OK;
                time_status = STA_UNSYNC;
+               ntp_next_leap_sec = TIME64_MAX;
                /* restart PPS frequency calibration */
                pps_reset_freq_interval();
        }
index bbd102ad9df7c8fcc5df253faf6970d078ba9db9..65430504ca2630c31d185429e6a2573c817b43ae 100644 (file)
@@ -5,6 +5,7 @@ extern void ntp_init(void);
 extern void ntp_clear(void);
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
 extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(unsigned long secs);
 extern int ntp_validate_timex(struct timex *);
 extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
index 849b93265904f8a5fb5e9d894a9b38f085c52890..5d67ffb7e3173159e29c062a911a6d4dbf95f302 100644 (file)
@@ -539,6 +539,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
 
+/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ *
+ * Caches the value reported by ntp_get_next_leap() in
+ * tk->next_leap_ktime. A real leap time is rebased by subtracting
+ * tk->offs_real; the KTIME_MAX "nothing pending" value is stored
+ * unchanged rather than being offset.
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+       tk->next_leap_ktime = ntp_get_next_leap();
+       if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+               /* Convert to monotonic time */
+               tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
 /*
  * Update the ktime_t based scalar nsec members of the timekeeper
  */
@@ -580,6 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
                ntp_clear();
        }
 
+       tk_update_leap_state(tk);
        tk_update_ktime_data(tk);
 
        update_vsyscall(tk);
@@ -1956,15 +1968,22 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 
                base = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);
+               base = ktime_add_ns(base, nsecs);
+
                if (*cwsseq != tk->clock_was_set_seq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }
+
+               /* Handle leapsecond insertion adjustments */
+               if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+                       *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
+
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
-       return ktime_add_ns(base, nsecs);
+       return base;
 }
 
 /**
@@ -2006,6 +2025,8 @@ int do_adjtimex(struct timex *txc)
                __timekeeping_set_tai_offset(tk, tai);
                timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        }
+       tk_update_leap_state(tk);
+
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);