From: Brad Peters <bpeters@redhat.com> Date: Thu, 24 Jul 2008 18:20:23 -0400 Subject: [ppc] RAS update for Cell Message-id: 20080724222023.17682.56389.sendpatchset@squad5-lp1.lab.bos.redhat.com O-Subject: [PATCH RHEL5.3] RAS update for Cell Bugzilla: 313731 RH-Acked-by: David Howells <dhowells@redhat.com> RHBZ#: ====== https://bugzilla.redhat.com/show_bug.cgi?id=313731 Description: =========== New Feature / Power Arch Only This patch adds support for inspecting SPU state after a kernel crash, using the kdump vmcore file. The implementation is based on the xmon code, but the new functionality is kept independent of xmon. kABI Status: ============ No symbols were harmed. Brew: ===== Built on all platforms. http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1370193 Kernel binary rpm available at: =============================== http://people.redhat.com/bpeters/kernels/kernel-2.6.18-94.el5.94.el5.313731.ppc64.rpm Upstream Status: ================ I checked, and this change is in upstream 2.6.24-rc7. Test Status: ============ SPU information was confirmed to be saved on a kexec crash. 
Test results: ----------- crash> extend extensions/spu.so # loading extension ./extensions/spu.so: shared object loaded crash> spus NODE 0: ID SPUADDR SPUSTATUS CTXADDR CTXSTATE PID 0 c000000000a33a80 IDLE 0 - 0 1 c000000000a33e80 RUNNING c000000025206f80 RUNNABLE 3557 2 c000000000a34280 RUNNING c000000025207f80 RUNNABLE 3556 3 c000000000a34680 RUNNING c000000025208f80 RUNNABLE 3555 4 c00000002ff48a00 RUNNING c000000025209f80 RUNNABLE 3554 5 c00000002ff46a00 RUNNING c00000002520af80 RUNNABLE 3553 6 c00000002ff44e00 RUNNING c00000002520bf80 RUNNABLE 3552 7 c00000002ff44a00 RUNNING c00000003ecf9d80 RUNNABLE 3551 NODE 1: ID SPUADDR SPUSTATUS CTXADDR CTXSTATE PID 8 c00000002ff44600 RUNNING c00000003ecf3d80 RUNNABLE 3550 9 c00000002ff44200 RUNNING c000000020c79d00 RUNNABLE 3548 10 c00000002ff43e00 RUNNING c000000020c70d00 RUNNABLE 3549 11 c000000000a34a80 RUNNING c00000002ff57300 RUNNABLE 3547 12 c000000000a34e80 RUNNING c00000002ff5d300 RUNNABLE 3546 13 c000000000a35280 RUNNING c00000003080de80 RUNNABLE 3544 14 c000000000a35680 RUNNING c000000030800e80 RUNNABLE 3545 15 c000000000a35a80 RUNNING c00000003e370900 RUNNABLE 3543 crash> spurq # run queue is empty... 
crash> spuctx c00000003ecf3d80 Dumping context fields for spu_context c00000003ecf3d80: state = 0 prio = 120 local_store = 0xc0000000272158a0 rq = 0xc00000003ecf4748 name = spe node = 1 number = 8 pid = 3550 slb_replace = 0x1 mm = 0xc00000003e9f0500 timestamp = 0x101a8f9dc class_0_pending = 0 problem = 0xd000080080690000 priv2 = 0xd0000800806b0000 flags = 0x0 saved_mfc_sr1_RW = 0x3b saved_mfc_dar = 0x184c080 saved_mfc_dsisr = 0x0 saved_spu_runcntl_RW = 0x1 saved_spu_status_R = 0x9 saved_spu_npc_RW = 0x0 crash> extend extensions/spu.so # loading extension ./extensions/spu.so: shared object loaded crash> spus NODE 0: ID SPUADDR SPUSTATUS CTXADDR CTXSTATE PID 0 c000000000a33a80 IDLE 0 - 0 1 c000000000a33e80 RUNNING c000000025206f80 RUNNABLE 3557 2 c000000000a34280 RUNNING c000000025207f80 RUNNABLE 3556 3 c000000000a34680 RUNNING c000000025208f80 RUNNABLE 3555 4 c00000002ff48a00 RUNNING c000000025209f80 RUNNABLE 3554 5 c00000002ff46a00 RUNNING c00000002520af80 RUNNABLE 3553 6 c00000002ff44e00 RUNNING c00000002520bf80 RUNNABLE 3552 7 c00000002ff44a00 RUNNING c00000003ecf9d80 RUNNABLE 3551 NODE 1: ID SPUADDR SPUSTATUS CTXADDR CTXSTATE PID 8 c00000002ff44600 RUNNING c00000003ecf3d80 RUNNABLE 3550 9 c00000002ff44200 RUNNING c000000020c79d00 RUNNABLE 3548 10 c00000002ff43e00 RUNNING c000000020c70d00 RUNNABLE 3549 11 c000000000a34a80 RUNNING c00000002ff57300 RUNNABLE 3547 12 c000000000a34e80 RUNNING c00000002ff5d300 RUNNABLE 3546 13 c000000000a35280 RUNNING c00000003080de80 RUNNABLE 3544 14 c000000000a35680 RUNNING c000000030800e80 RUNNABLE 3545 15 c000000000a35a80 RUNNING c00000003e370900 RUNNABLE 3543 crash> spurq # run queue is empty... 
crash> spuctx c00000003ecf3d80 Dumping context fields for spu_context c00000003ecf3d80: state = 0 prio = 120 local_store = 0xc0000000272158a0 rq = 0xc00000003ecf4748 name = spe node = 1 number = 8 pid = 3550 slb_replace = 0x1 mm = 0xc00000003e9f0500 timestamp = 0x101a8f9dc class_0_pending = 0 problem = 0xd000080080690000 priv2 = 0xd0000800806b0000 flags = 0x0 saved_mfc_sr1_RW = 0x3b saved_mfc_dar = 0x184c080 saved_mfc_dsisr = 0x0 saved_spu_runcntl_RW = 0x1 saved_spu_status_R = 0x9 saved_spu_npc_RW = 0x0 =============================================================== Brad Peters 1-978-392-1000 x 23183 IBM on-site partner. Proposed Patch: =============== This patch is based on 2.6.18-94.el5 diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 1af41f7..f1f602e 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -274,6 +274,72 @@ void crash_kexec_secondary(struct pt_regs *regs) cpus_in_sr = CPU_MASK_NONE; } #endif +#ifdef CONFIG_SPU_BASE + +#include <asm/spu.h> +#include <asm/spu_priv1.h> + +struct crash_spu_info { + struct spu *spu; + u32 saved_spu_runcntl_RW; + u32 saved_spu_status_R; + u32 saved_spu_npc_RW; + u64 saved_mfc_sr1_RW; + u64 saved_mfc_dar; + u64 saved_mfc_dsisr; +}; + +#define CRASH_NUM_SPUS 16 /* Enough for current hardware */ +static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS]; + +static void crash_kexec_stop_spus(void) +{ + struct spu *spu; + int i; + u64 tmp; + + for (i = 0; i < CRASH_NUM_SPUS; i++) { + if (!crash_spu_info[i].spu) + continue; + + spu = crash_spu_info[i].spu; + + crash_spu_info[i].saved_spu_runcntl_RW = + in_be32(&spu->problem->spu_runcntl_RW); + crash_spu_info[i].saved_spu_status_R = + in_be32(&spu->problem->spu_status_R); + crash_spu_info[i].saved_spu_npc_RW = + in_be32(&spu->problem->spu_npc_RW); + + crash_spu_info[i].saved_mfc_dar = spu_mfc_dar_get(spu); + crash_spu_info[i].saved_mfc_dsisr = spu_mfc_dsisr_get(spu); + tmp = spu_mfc_sr1_get(spu); + 
crash_spu_info[i].saved_mfc_sr1_RW = tmp; + + tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK; + spu_mfc_sr1_set(spu, tmp); + + __delay(200); + } +} + +void crash_register_spus(struct list_head *list) +{ + struct spu *spu; + + list_for_each_entry(spu, list, full_list) { + if (spu->number >= CRASH_NUM_SPUS){ WARN_ON(1); continue; + } + + crash_spu_info[spu->number].spu = spu; + } +} + +#else +static inline void crash_kexec_stop_spus(void) +{ +} +#endif /* CONFIG_SPU_BASE */ void default_machine_crash_shutdown(struct pt_regs *regs) { @@ -309,6 +375,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) crash_save_this_cpu(regs, crashing_cpu); crash_kexec_prepare_cpus(crashing_cpu); cpu_set(crashing_cpu, cpus_in_crash); + crash_kexec_stop_spus(); if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(1, 0); } diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index de45b16..c2c7bc4 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -668,6 +668,7 @@ static int __init init_spu_base(void) spu_init_affinity(); + crash_register_spus(&spu_full_list); return 0; out_unregister_sysdev_class: diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h index ea1e75d..ddd23db 100644 --- a/include/asm-powerpc/spu.h +++ b/include/asm-powerpc/spu.h @@ -217,6 +217,14 @@ static inline void crash_register_spus(struct list_head *list) } #endif +#ifdef CONFIG_KEXEC +void crash_register_spus(struct list_head *list); +#else +static inline void crash_register_spus(struct list_head *list) +{ +} +#endif + extern void spu_invalidate_slbs(struct spu *spu); extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);