From: Brian Maly <bmaly@redhat.com> Subject: [RHEL5.1 patch] MCE thermal throttling Date: Wed, 06 Jun 2007 23:13:12 -0400 Bugzilla: 224187 Message-Id: <466777C8.8040206@redhat.com> Changelog: [x86] MCE thermal throttling Resolves BZ 224187 Currently in RHEL5, if the CPU temperatures rises above the set threshold, thermal warning messages spew to all consoles every few seconds. This has historically been the way thermal events have been handled in Linux, and while it works, it is less than ideal. A new mechanism to handle thermal events for Intel is now upstream. The new mechanism logs thermal events to the mcelog and sets up counters in /sys to keep track of the number of thermal throttle events. Users are still notified of important thermal events, but notification of such an event is now handled more sanely and cleanly. This patch is a backport from upstream. Its essentially the same patch I posted for RHEL4 for the same issue a while back. Testing efforts revealed no problems. Im currently having some cross vendor testing done, as it is essential this code work right. I want to ensure there are no regressions. Brian --- linux-2.6.18.noarch/arch/x86_64/kernel/mce.c.orig 2007-05-30 13:06:27.000000000 -0400 +++ linux-2.6.18.noarch/arch/x86_64/kernel/mce.c 2007-05-30 13:15:00.000000000 -0400 @@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * r atomic_dec(&mce_entry); } +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + /* * Periodic polling timer for "silent" machine check errors. */ --- linux-2.6.18.noarch/arch/x86_64/kernel/Makefile.orig 2007-05-30 13:06:57.000000000 -0400 +++ linux-2.6.18.noarch/arch/x86_64/kernel/Makefile 2007-05-30 13:16:58.000000000 -0400 @@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o trap pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_X86_MCE) += mce.o +obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ @@ -47,6 +47,7 @@ obj-y += intel_cacheinfo.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o --- linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/p4.c.orig 2007-05-30 13:08:06.000000000 -0400 +++ linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/p4.c 2007-05-30 13:18:57.000000000 -0400 @@ -13,6 +13,8 @@ #include <asm/msr.h> #include <asm/apic.h> +#include <asm/therm_throt.h> + #include "mce.h" /* as supported by the P4/Xeon family */ @@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt /* P4/Xeon Thermal transition interrupt handler */ static void intel_thermal_interrupt(struct pt_regs *regs) { - u32 l, h; - unsigned int cpu = smp_processor_id(); - static unsigned long next[NR_CPUS]; + __u64 msr_val; ack_APIC_irq(); - if (time_after(next[cpu], jiffies)) - return; - - next[cpu] = jiffies + HZ*5; - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + therm_throt_process(msr_val & 0x1); } /* Thermal interrupt handler for this CPU setup */ @@ -126,6 +115,9 @@ static void intel_init_thermal(struct cp l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ --- linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/Makefile.orig 2007-05-30 13:08:30.000000000 -0400 +++ linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/Makefile 2007-05-30 13:19:43.000000000 -0400 @@ -1,2 +1,2 @@ -obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o --- linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/therm_throt.c.orig 2007-05-30 13:08:59.000000000 -0400 +++ linux-2.6.18.noarch/arch/i386/kernel/cpu/mcheck/therm_throt.c 2007-05-30 13:20:41.000000000 -0400 @@ -0,0 +1,184 @@ +/* + * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c + * + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. + */ + +#include <linux/percpu.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <asm/cpu.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <asm/therm_throt.h> + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +atomic_t therm_throt_en = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_sysdev_show_func(count); +define_therm_throt_sysdev_one_ro(count); + +static struct attribute *thermal_throttle_attrs[] = { + &attr_count.attr, + NULL +}; + +static struct attribute_group thermal_throttle_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +/*** + * therm_throt_process - Process thermal throttling event from interrupt + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +int therm_throt_process(int curr) +{ + unsigned int cpu = smp_processor_id(); + __u64 tmp_jiffs = get_jiffies_64(); + + if (curr) + __get_cpu_var(thermal_throttle_count)++; + + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + return 0; + + __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + + /* if we just entered the thermal event */ + if (curr) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, " + "cpu clock throttled (total events = %lu)\n", cpu, + __get_cpu_var(thermal_throttle_count)); + + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } + + return 1; +} + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device */ +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +{ + return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); +} + +static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +{ + return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); +} + +/* Mutex protecting device creation against CPU hotplug */ +static DEFINE_MUTEX(therm_cpu_lock); + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + int err; + + sys_dev = get_cpu_sysdev(cpu); + mutex_lock(&therm_cpu_lock); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + err = thermal_throttle_add_dev(sys_dev); + WARN_ON(err); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + thermal_throttle_remove_dev(sys_dev); + break; + } + mutex_unlock(&therm_cpu_lock); + return NOTIFY_OK; +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + int err; + + if (!atomic_read(&therm_throt_en)) + return 0; + + register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU + mutex_lock(&therm_cpu_lock); +#endif + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) { + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + WARN_ON(err); + } +#ifdef CONFIG_HOTPLUG_CPU + mutex_unlock(&therm_cpu_lock); +#endif + + return 0; +} + +device_initcall(thermal_throttle_init_device); +#endif /* CONFIG_SYSFS */ --- linux-2.6.18.noarch/include/linux/notifier.h.orig 2007-05-30 13:47:29.000000000 -0400 +++ linux-2.6.18.noarch/include/linux/notifier.h 2007-05-30 13:50:30.000000000 -0400 @@ -154,5 +154,13 @@ extern int raw_notifier_call_chain(struc #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ +/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend + * operation in progress + */ +#define CPU_TASKS_FROZEN 0x0010 + +#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) +#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) + #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ --- linux-2.6.18.noarch/include/linux/jiffies.h.orig 2007-05-30 13:44:30.000000000 -0400 +++ linux-2.6.18.noarch/include/linux/jiffies.h 2007-05-30 13:46:07.000000000 -0400 @@ -126,6 +126,21 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(b) - (__s64)(a) < 0)) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(a) - (__s64)(b) >= 0)) +#define time_before_eq64(a,b) time_after_eq64(b,a) + /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. --- linux-2.6.18.noarch/include/asm-x86_64/therm_throt.h.orig 2007-05-30 13:10:04.000000000 -0400 +++ linux-2.6.18.noarch/include/asm-x86_64/therm_throt.h 2007-05-30 13:21:28.000000000 -0400 @@ -0,0 +1 @@ +#include <asm-i386/therm_throt.h> --- linux-2.6.18.noarch/include/asm-i386/therm_throt.h.orig 2007-05-30 13:10:14.000000000 -0400 +++ linux-2.6.18.noarch/include/asm-i386/therm_throt.h 2007-05-30 13:21:48.000000000 -0400 @@ -0,0 +1,9 @@ +#ifndef __ASM_I386_THERM_THROT_H__ +#define __ASM_I386_THERM_THROT_H__ 1 + +#include <asm/atomic.h> + +extern atomic_t therm_throt_en; +int therm_throt_process(int curr); + +#endif /* __ASM_I386_THERM_THROT_H__ */