From: Prarit Bhargava <prarit@redhat.com> Subject: [RHEL 5.1 PATCH]: intel-rng: replace smp_call_function with stop_machine_run Date: Thu, 1 Mar 2007 08:17:00 -0500 Bugzilla: 227696 Message-Id: <20070301131700.24917.7152.sendpatchset@prarit.boston.redhat.com> Changelog: [misc] intel-rng: fix deadlock in smp_call_function The deadlock occurs because two CPUs are in contention over a rw_lock and the "call_function" puts the CPUs in such a state that no forward progress will be made until the calling CPU has completed it's code. Here's a more detailed example (sorry for the cut-and-paste): 1. CPU A has done read_lock(&lock), and has acquired the lock. 2. CPU B has done write_lock_irq(&lock) and is waiting for A to release the lock. CPU B has disabled interrupts while waiting for the interrupt: void __lockfunc _write_lock_irq(rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } 3. CPU C issues smp_call_function, as in the case of the intel-rng driver: set_mb(waitflag, 1); smp_call_function(intel_init_wait, NULL, 1, 0); ... // do some stuff with interrupts disabled ... set_mb(waitflag, 0); where static char __initdata waitflag; static void __init intel_init_wait(void *unused) { while (waitflag) cpu_relax(); } In this code the calling processor, C, has issued an IPI and disabled interrupts on every processor except itself. When each processor takes the IPI it runs intel_init_wait and waits in a tight loop until waitflag is zero. ie) no forward progress on any CPU. CPU C will not execute the code below the smp_call_function until all processors have started (not completed!) the IPI function. >From call_smp_function: cpus = num_online_cpus() - 1; ... /* Send a message to all other CPUs and wait for them to respond */ send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ while (atomic_read(&data.started) != cpus) cpu_relax(); So CPU C is waiting here. 4. CPU A, which holds the lock sees the IPI and is in the intel_init_wait code, happily waiting. CPU A has incremented data.started. CPU A will stay in this loop until CPU C sets waitflag = 0. 5. CPU B, if you recall is _waiting with interrupts disabled_ for CPU A to release the lock. It does not see the IPI because it has interrupts disabled It will not see the IPI until CPU A has released the lock. 6. CPU C is eventually only waiting for CPU B to do the final increment of data.started = cpus. CPU B is waiting for CPU A to release the lock. CPU A is executing a tight loop which it will not exit from until CPU C can set waitflag to zero. That's a 3-way deadlock. So, the issue is placing the other CPUs in a state that they do not make forward progress. The deadlock occurs before the calling CPU has disabled interrupts in the code in step 3. I also tested this code without the __init tags and explicitly coding waitflag=0 to avoid the gcc only setting .bss section variables to zero error that someone fixed last week. I also removed the code that disabled interrupts on the calling processor which had no effect -- at first I thought it was a simple interrupt issue ... Resolves BZ 226796. Tested successfully by Intel, Unisys, and by me. --- linux-2.6.18.x86_64.orig/drivers/char/hw_random/intel-rng.c 2007-02-23 06:55:34.000000000 -0500 +++ linux-2.6.18.x86_64/drivers/char/hw_random/intel-rng.c 2007-02-26 04:59:36.000000000 -0500 @@ -24,10 +24,11 @@ * warranty of any kind, whether express or implied. */ -#include <linux/module.h> +#include <linux/hw_random.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/pci.h> -#include <linux/hw_random.h> +#include <linux/stop_machine.h> #include <asm/io.h> @@ -217,30 +218,117 @@ static struct hwrng intel_rng = { .data_read = intel_rng_data_read, }; +struct intel_rng_hw { + struct pci_dev *dev; + void __iomem *mem; + u8 bios_cntl_off; + u8 bios_cntl_val; + u8 fwh_dec_en1_off; + u8 fwh_dec_en1_val; +}; + +static int __init intel_rng_hw_init(void *_intel_rng_hw) +{ + struct intel_rng_hw *intel_rng_hw = _intel_rng_hw; + u8 mfc, dvc; + + /* interrupts disabled in stop_machine_run call */ + + if (!(intel_rng_hw->fwh_dec_en1_val & FWH_F8_EN_MASK)) + pci_write_config_byte(intel_rng_hw->dev, + intel_rng_hw->fwh_dec_en1_off, + intel_rng_hw->fwh_dec_en1_val | + FWH_F8_EN_MASK); + if (!(intel_rng_hw->bios_cntl_val & BIOS_CNTL_WRITE_ENABLE_MASK)) + pci_write_config_byte(intel_rng_hw->dev, + intel_rng_hw->bios_cntl_off, + intel_rng_hw->bios_cntl_val | + BIOS_CNTL_WRITE_ENABLE_MASK); + + writeb(INTEL_FWH_RESET_CMD, intel_rng_hw->mem); + writeb(INTEL_FWH_READ_ID_CMD, intel_rng_hw->mem); + mfc = readb(intel_rng_hw->mem + INTEL_FWH_MANUFACTURER_CODE_ADDRESS); + dvc = readb(intel_rng_hw->mem + INTEL_FWH_DEVICE_CODE_ADDRESS); + writeb(INTEL_FWH_RESET_CMD, intel_rng_hw->mem); + + if (!(intel_rng_hw->bios_cntl_val & + (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK))) + pci_write_config_byte(intel_rng_hw->dev, + intel_rng_hw->bios_cntl_off, + intel_rng_hw->bios_cntl_val); + if (!(intel_rng_hw->fwh_dec_en1_val & FWH_F8_EN_MASK)) + pci_write_config_byte(intel_rng_hw->dev, + intel_rng_hw->fwh_dec_en1_off, + intel_rng_hw->fwh_dec_en1_val); + + if (mfc != INTEL_FWH_MANUFACTURER_CODE || + (dvc != INTEL_FWH_DEVICE_CODE_8M && + dvc != INTEL_FWH_DEVICE_CODE_4M)) { + printk(KERN_ERR PFX "FWH not detected\n"); + return -ENODEV; + } -#ifdef CONFIG_SMP -static char __initdata waitflag; + return 0; +} -static void __init intel_init_wait(void *unused) +static int __init intel_init_hw_struct(struct intel_rng_hw *intel_rng_hw, + struct pci_dev *dev) { - while (waitflag) - cpu_relax(); + intel_rng_hw->bios_cntl_val = 0xff; + intel_rng_hw->fwh_dec_en1_val = 0xff; + intel_rng_hw->dev = dev; + + /* Check for Intel 82802 */ + if (dev->device < 0x2640) { + intel_rng_hw->fwh_dec_en1_off = FWH_DEC_EN1_REG_OLD; + intel_rng_hw->bios_cntl_off = BIOS_CNTL_REG_OLD; + } else { + intel_rng_hw->fwh_dec_en1_off = FWH_DEC_EN1_REG_NEW; + intel_rng_hw->bios_cntl_off = BIOS_CNTL_REG_NEW; + } + + pci_read_config_byte(dev, intel_rng_hw->fwh_dec_en1_off, + &intel_rng_hw->fwh_dec_en1_val); + pci_read_config_byte(dev, intel_rng_hw->bios_cntl_off, + &intel_rng_hw->bios_cntl_val); + + if ((intel_rng_hw->bios_cntl_val & + (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK)) + == BIOS_CNTL_LOCK_ENABLE_MASK) { + static __initdata /*const*/ char warning[] = + KERN_WARNING PFX "Firmware space is locked read-only. " + KERN_WARNING PFX "If you can't or\n don't want to " + KERN_WARNING PFX "disable this in firmware setup, and " + KERN_WARNING PFX "if\n you are certain that your " + KERN_WARNING PFX "system has a functional\n RNG, try" + KERN_WARNING PFX "using the 'no_fwh_detect' option.\n"; + + if (no_fwh_detect) + return -ENODEV; + printk(warning); + return -EBUSY; + } + + intel_rng_hw->mem = ioremap_nocache(INTEL_FWH_ADDR, INTEL_FWH_ADDR_LEN); + if (intel_rng_hw->mem == NULL) + return -EBUSY; + + return 0; } -#endif + static int __init mod_init(void) { int err = -ENODEV; - unsigned i; + int i; struct pci_dev *dev = NULL; - void __iomem *mem; - unsigned long flags; - u8 bios_cntl_off, fwh_dec_en1_off; - u8 bios_cntl_val = 0xff, fwh_dec_en1_val = 0xff; - u8 hw_status, mfc, dvc; + void __iomem *mem = mem; + u8 hw_status; + struct intel_rng_hw *intel_rng_hw; for (i = 0; !dev && pci_tbl[i].vendor; ++i) - dev = pci_get_device(pci_tbl[i].vendor, pci_tbl[i].device, NULL); + dev = pci_get_device(pci_tbl[i].vendor, pci_tbl[i].device, + NULL); if (!dev) goto out; /* Device not found. */ @@ -250,39 +338,18 @@ static int __init mod_init(void) goto fwh_done; } - /* Check for Intel 82802 */ - if (dev->device < 0x2640) { - fwh_dec_en1_off = FWH_DEC_EN1_REG_OLD; - bios_cntl_off = BIOS_CNTL_REG_OLD; - } else { - fwh_dec_en1_off = FWH_DEC_EN1_REG_NEW; - bios_cntl_off = BIOS_CNTL_REG_NEW; - } - - pci_read_config_byte(dev, fwh_dec_en1_off, &fwh_dec_en1_val); - pci_read_config_byte(dev, bios_cntl_off, &bios_cntl_val); - - if ((bios_cntl_val & - (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK)) - == BIOS_CNTL_LOCK_ENABLE_MASK) { - static __initdata /*const*/ char warning[] = - KERN_WARNING PFX "Firmware space is locked read-only. If you can't or\n" - KERN_WARNING PFX "don't want to disable this in firmware setup, and if\n" - KERN_WARNING PFX "you are certain that your system has a functional\n" - KERN_WARNING PFX "RNG, try using the 'no_fwh_detect' option.\n"; - + intel_rng_hw = kmalloc(sizeof(*intel_rng_hw), GFP_KERNEL); + if (!intel_rng_hw) { pci_dev_put(dev); - if (no_fwh_detect) - goto fwh_done; - printk(warning); - err = -EBUSY; goto out; } - mem = ioremap_nocache(INTEL_FWH_ADDR, INTEL_FWH_ADDR_LEN); - if (mem == NULL) { + err = intel_init_hw_struct(intel_rng_hw, dev); + if (err == -ENODEV) { + pci_dev_put(dev); + goto fwh_done; + } else if (err < 0) { pci_dev_put(dev); - err = -EBUSY; goto out; } @@ -290,59 +357,17 @@ static int __init mod_init(void) * Since the BIOS code/data is going to disappear from its normal * location with the Read ID command, all activity on the system * must be stopped until the state is back to normal. + * + * Use stop_machine_run because IPIs can be blocked by disabling + * interrupts. */ -#ifdef CONFIG_SMP - set_mb(waitflag, 1); - if (smp_call_function(intel_init_wait, NULL, 1, 0) != 0) { - set_mb(waitflag, 0); - pci_dev_put(dev); - printk(KERN_ERR PFX "cannot run on all processors\n"); - err = -EAGAIN; - goto err_unmap; - } -#endif - local_irq_save(flags); - - if (!(fwh_dec_en1_val & FWH_F8_EN_MASK)) - pci_write_config_byte(dev, - fwh_dec_en1_off, - fwh_dec_en1_val | FWH_F8_EN_MASK); - if (!(bios_cntl_val & BIOS_CNTL_WRITE_ENABLE_MASK)) - pci_write_config_byte(dev, - bios_cntl_off, - bios_cntl_val | BIOS_CNTL_WRITE_ENABLE_MASK); - - writeb(INTEL_FWH_RESET_CMD, mem); - writeb(INTEL_FWH_READ_ID_CMD, mem); - mfc = readb(mem + INTEL_FWH_MANUFACTURER_CODE_ADDRESS); - dvc = readb(mem + INTEL_FWH_DEVICE_CODE_ADDRESS); - writeb(INTEL_FWH_RESET_CMD, mem); - - if (!(bios_cntl_val & - (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK))) - pci_write_config_byte(dev, bios_cntl_off, bios_cntl_val); - if (!(fwh_dec_en1_val & FWH_F8_EN_MASK)) - pci_write_config_byte(dev, fwh_dec_en1_off, fwh_dec_en1_val); - - local_irq_restore(flags); -#ifdef CONFIG_SMP - /* Tell other CPUs to resume. */ - set_mb(waitflag, 0); -#endif - - iounmap(mem); + err = stop_machine_run(intel_rng_hw_init, intel_rng_hw, NR_CPUS); pci_dev_put(dev); - - if (mfc != INTEL_FWH_MANUFACTURER_CODE || - (dvc != INTEL_FWH_DEVICE_CODE_8M && - dvc != INTEL_FWH_DEVICE_CODE_4M)) { - printk(KERN_ERR PFX "FWH not detected\n"); - err = -ENODEV; - goto out; - } + iounmap(intel_rng_hw->mem); + kfree(intel_rng_hw); + goto out; fwh_done: - err = -ENOMEM; mem = ioremap(INTEL_RNG_ADDR, INTEL_RNG_ADDR_LEN); if (!mem) @@ -352,22 +377,21 @@ fwh_done: /* Check for Random Number Generator */ err = -ENODEV; hw_status = hwstatus_get(mem); - if ((hw_status & INTEL_RNG_PRESENT) == 0) - goto err_unmap; + if ((hw_status & INTEL_RNG_PRESENT) == 0) { + iounmap(mem); + goto out; + } printk(KERN_INFO "Intel 82802 RNG detected\n"); err = hwrng_register(&intel_rng); if (err) { printk(KERN_ERR PFX "RNG registering failed (%d)\n", err); - goto err_unmap; + iounmap(mem); } out: return err; -err_unmap: - iounmap(mem); - goto out; } static void __exit mod_exit(void) --- linux-2.6.18.x86_64.orig/kernel/stop_machine.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.x86_64/kernel/stop_machine.c 2007-02-23 07:16:29.000000000 -0500 @@ -1,8 +1,9 @@ -#include <linux/stop_machine.h> -#include <linux/kthread.h> -#include <linux/sched.h> #include <linux/cpu.h> #include <linux/err.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/stop_machine.h> #include <linux/syscalls.h> #include <asm/atomic.h> #include <asm/semaphore.h> @@ -205,3 +206,4 @@ int stop_machine_run(int (*fn)(void *), return ret; } +EXPORT_SYMBOL_GPL(stop_machine_run);