From: Prarit Bhargava <prarit@redhat.com> Date: Tue, 18 Aug 2009 10:54:08 -0400 Subject: [x86_64] fix gettimeoday TSC overflow issue - 1 Message-id: 4A8AC090.6080106@redhat.com O-Subject: [RHEL5 PATCH]: Fix gettimeoday() TSC overflow issue [v3] Bugzilla: 467942 RH-Acked-by: Rik van Riel <riel@redhat.com> RH-Acked-by: Chris Lalancette <clalance@redhat.com> RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com> A quick repost for dzickus. I had to update this patch for -148.el5 because of significant conflicts in that kernel. The patches that my patch conflicted with have since been removed and now [v2] will no longer apply properly. Refresh for -163.el5. Quick ACKs are appreciated -- I think those of you cc'd have already acked [v2]. P. diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b1f1e22..c3b3ea0 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -78,6 +78,7 @@ static int notsc __initdata = 0; #define NSEC_PER_REAL_TICK (NSEC_PER_SEC / REAL_HZ) #define NS_SCALE 10 /* 2^10, carefully chosen */ +#define NS_SCALE_22 22 /* 2^22, carefully chosen for TSC */ #define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ @@ -120,7 +121,7 @@ static inline long do_gettimeoffset_tsc(void) t = get_cycles_sync(); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> NS_SCALE; + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> NS_SCALE_22; return x; } @@ -400,6 +401,11 @@ static void do_timer_account_lost_ticks(struct pt_regs *regs) { unsigned long tsc; int delay = 0, offset = 0, lost = 0, i; + long tsc_offset = 0; + + /* for re-calculate offset */ + long last_tsc_quot = vxtime.tsc_quot; + unsigned long last_tsc = vxtime.last_tsc; if (vxtime.hpet_address) offset = hpet_readl(HPET_COUNTER); @@ -443,27 +449,24 @@ static void do_timer_account_lost_ticks(struct pt_regs *regs) lost = pmtimer_mark_offset(); #endif } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> NS_SCALE) - NSEC_PER_REAL_TICK; + tsc_offset = (((tsc - vxtime.last_tsc) * + vxtime.tsc_quot) >> NS_SCALE_22) - + NSEC_PER_REAL_TICK; - if (offset < 0) - offset = 0; + if (tsc_offset < 0) + tsc_offset = 0; lost = 0; - while (offset > NSEC_PER_REAL_TICK) { + while (tsc_offset > NSEC_PER_REAL_TICK) { lost++; - offset -= NSEC_PER_REAL_TICK; + tsc_offset -= NSEC_PER_REAL_TICK; } /* FIXME: 1000 or 1000000? */ monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> NS_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << NS_SCALE) / vxtime.tsc_quot) - 1; + vxtime.last_tsc = tsc - vxtime.quot * delay / + (vxtime.tsc_quot >> (NS_SCALE_22 - NS_SCALE)); } /* SCALE: We expect tick_divider - 1 lost, ie 0 for normal behaviour */ if (lost > (int)tick_divider - 1) { @@ -474,6 +477,16 @@ static void do_timer_account_lost_ticks(struct pt_regs *regs) /* Do the timer stuff */ for (i = 0; i < tick_divider; i++) do_timer_jiffy(regs); + + /* re-calculate vxtime.last_tsc */ + if (vxtime.mode != VXTIME_HPET && vxtime.mode != VXTIME_PMTMR) { + if (tsc > (last_tsc + tsc_khz)) { + vxtime.last_tsc = vxtime.last_tsc - + (tsc - last_tsc - + ((u64)tsc_khz * (u64)(lost + 1))) * + last_tsc_quot / vxtime.tsc_quot; + } + } } /* @@ -577,12 +590,12 @@ static unsigned int cyc2ns_scale __read_mostly; static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE_22) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> NS_SCALE; + return (cyc * cyc2ns_scale) >> NS_SCALE_22; } unsigned long long sched_clock(void) @@ -800,7 +813,8 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; + vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE_22) / + tsc_khz; } set_cyc2ns_scale(tsc_khz_ref); @@ -1134,7 +1148,7 @@ void __init time_init(void) vxtime.mode = VXTIME_TSC; vxtime.quot = (NSEC_PER_SEC << NS_SCALE) / vxtime_hz; - vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; + vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE_22) / tsc_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); @@ -1230,7 +1244,7 @@ void time_init_gtod(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); vxtime.quot = (NSEC_PER_SEC << NS_SCALE) / vxtime_hz; - vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE) / tsc_khz; + vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE_22) / tsc_khz; vxtime.last_tsc = get_cycles_sync(); set_cyc2ns_scale(tsc_khz); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 69719d6..bb42909 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -53,6 +53,8 @@ int __vgetcpu_mode __section_vgetcpu_mode; ((v - fix_to_virt(VSYSCALL_FIRST_PAGE)) + __pa_symbol(&__vsyscall_0)); }) #define NS_SCALE 10 /* 2^10, carefully chosen */ +#define NS_SCALE_22 22 /* 2^22, carefully chosen for TSC*/ + static __always_inline void timeval_normalize(struct timeval * tv) { @@ -82,7 +84,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) if (t < __vxtime.last_tsc) t = __vxtime.last_tsc; nsec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> NS_SCALE; + __vxtime.tsc_quot) >> NS_SCALE_22; } else { nsec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - diff --git a/kernel/timer.c b/kernel/timer.c index 0e831b5..e7a3e51 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1294,6 +1294,22 @@ static void update_wall_time(void) xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; +#ifdef CONFIG_X86_64 +#define NS_SCALE_22 22 /* 2^22, carefully chosen */ + { +#ifdef CONFIG_XEN + unsigned int tsc_khz = cpu_khz; +#else + extern unsigned int tsc_khz; +#endif + /* re-calculate vxtime.tsc_quot */ + vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE_22) / tsc_khz * + (((s64)clock->xtime_interval + + (s64)clock->xtime_nsec) >> + clock->shift) / NSEC_PER_MSEC; + } +#endif + /* check to see if there is a new clocksource to use */ if (change_clocksource()) { clock->error = 0;