From: Chris Lalancette <clalance@redhat.com> Date: Fri, 20 Mar 2009 10:23:18 +0100 Subject: [x86] use cpu_khz for loops_per_jiffy calculation Message-id: 49C36086.2090801@redhat.com O-Subject: [RHEL5.4 PATCH 9/14]: x86: use cpu_khz for loops_per_jiffy calculation Bugzilla: 463573 RH-Acked-by: Rik van Riel <riel@redhat.com> RH-Acked-by: Justin M. Forbes <jforbes@redhat.com> On the x86 platform we can use the value of tsc_khz computed during TSC calibration to calculate the loops_per_jiffy value. It's very important to keep the error in the lpj value to a minimum, as any error in it may result in a kernel panic in check_timer. In a virtualized environment, on a highly overloaded host, the guest delay calibration may sometimes result in errors beyond the ~50% that timer_irq_works can handle, resulting in the guest panicking. This patch also makes some formatting changes to the lpj_setup code so that a single printk now prints the BogoMIPS value. We use the calculated value only for the boot processor because the APs can have different base frequencies or the BIOS might boot an AP at a different frequency. 
This is actually 2 upstream commits: 3da757daf86e498872855f0b5e101f763ba79499 f3f3149f35b9195ef4b761b1353fc0766b5f53be Fixes BZ 463573 diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 9789209..473c226 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/dmi.h> #include <linux/acpi.h> +#include <linux/delay.h> #include <asm/delay.h> #include <asm/tsc.h> #include <asm/delay.h> @@ -205,6 +206,8 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); void tsc_init(void) { + u64 lpj; + if (!cpu_has_tsc || tsc_disable) return; @@ -219,6 +222,11 @@ void tsc_init(void) (unsigned long)cpu_khz % 1000); set_cyc2ns_scale(cpu_khz); + + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_fine = lpj; + use_tsc_delay(); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index fb9fcd9..b9f5303 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -35,6 +35,7 @@ #include <linux/kallsyms.h> #include <linux/efi.h> #include <linux/acpi.h> +#include <linux/delay.h> #ifdef CONFIG_ACPI #include <acpi/achware.h> /* for PM timer frequency */ #include <acpi/acpi_bus.h> @@ -1035,6 +1036,8 @@ void __init time_init(void) cpu_khz = tsc_khz; } + lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; + vxtime.mode = VXTIME_TSC; vxtime.quot = (NSEC_PER_SEC << NS_SCALE) / vxtime_hz; vxtime.tsc_quot = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; diff --git a/include/linux/delay.h b/include/linux/delay.h index 5443e1f..7b91236 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -39,6 +39,7 @@ extern unsigned long loops_per_jiffy; #define ndelay(x) udelay(((x)+999)/1000) #endif +extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index 06066a6..2785c54 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -7,9 +7,11 @@ #include <linux/sched.h> 
#include <linux/delay.h> #include <linux/init.h> +#include <linux/smp.h> #include <asm/timex.h> +unsigned long lpj_fine; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -33,9 +35,9 @@ static unsigned long __devinit calibrate_delay_direct(void) unsigned long pre_start, start, post_start; unsigned long pre_end, end, post_end; unsigned long start_jiffies; - unsigned long tsc_rate_min, tsc_rate_max; - unsigned long good_tsc_sum = 0; - unsigned long good_tsc_count = 0; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; unsigned long delay_calibration_ticks = ((REAL_HZ < 100) ? 1 : (REAL_HZ/100)); int i; @@ -80,25 +82,27 @@ static unsigned long __devinit calibrate_delay_direct(void) } read_current_timer(&post_end); - tsc_rate_max = (post_end - pre_start) / delay_calibration_ticks; - tsc_rate_min = (pre_end - post_start) / delay_calibration_ticks; + timer_rate_max = (post_end - pre_start) / + delay_calibration_ticks; + timer_rate_min = (pre_end - post_start) / + delay_calibration_ticks; - tsc_rate_max /= tick_divider; - tsc_rate_min /= tick_divider; + timer_rate_max /= tick_divider; + timer_rate_min /= tick_divider; /* - * If the upper limit and lower limit of the tsc_rate is + * If the upper limit and lower limit of the timer_rate is * >= 12.5% apart, redo calibration. */ if (pre_start != 0 && pre_end != 0 && - (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { - good_tsc_count++; - good_tsc_sum += tsc_rate_max; + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; } } - if (good_tsc_count) - return (good_tsc_sum/good_tsc_count); + if (good_timer_count) + return (good_timer_sum/good_timer_count); printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. 
Consider using \"lpj=\" boot option.\n"); @@ -112,6 +116,10 @@ static unsigned long __devinit calibrate_delay_direct(void) {return 0;} * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -122,20 +130,20 @@ void __devinit calibrate_delay(void) if (preset_lpj) { loops_per_jiffy = preset_lpj; - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); + printk(KERN_INFO + "Calibrating delay loop (skipped) preset value.. "); + } else if ((smp_processor_id() == 0) && lpj_fine) { + loops_per_jiffy = lpj_fine; + printk(KERN_INFO + "Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. "); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); + printk(KERN_INFO + "Calibrating delay using timer specific routine.. "); } else { loops_per_jiffy = (1<<12); - printk(KERN_DEBUG "Calibrating delay loop... "); + printk(KERN_INFO "Calibrating delay loop... 
"); while ((loops_per_jiffy <<= 1) != 0) { /* wait for "start of" clock tick */ ticks = jiffies; @@ -165,12 +173,8 @@ void __devinit calibrate_delay(void) if (jiffies != ticks) /* longer than 1 tick */ loops_per_jiffy &= ~loopbit; } - - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); } - + printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); }